diff --git a/litstudy/sources/semanticscholar.py b/litstudy/sources/semanticscholar.py index 0ab60bd..209eb0a 100644 --- a/litstudy/sources/semanticscholar.py +++ b/litstudy/sources/semanticscholar.py @@ -1,6 +1,9 @@ from time import sleep from typing import Tuple, Optional from urllib.parse import urlencode, quote_plus + +from requests.adapters import HTTPAdapter +from urllib3.util import Retry import logging import requests import shelve @@ -90,38 +93,52 @@ def load(id): return fetch_semanticscholar(id) -S2_PAPER_URL = "https://api.semanticscholar.org/v1/paper/" +S2_PAPER_URL = "https://api.semanticscholar.org/graph/v1/paper/" S2_QUERY_URL = "https://api.semanticscholar.org/graph/v1/paper/search" -CACHE_FILE = ".semantischolar" -DEFAULT_TIMEOUT = 3.05 # 100 requests per 5 minutes +CACHE_FILE = ".semanticscholar" +MAX_RETRIES = 3 +BACKOFF_FACTOR = 1.05 + +def get_retry_session(): + session = requests.Session() + retry_strategy = Retry( + total=MAX_RETRIES, + backoff_factor=BACKOFF_FACTOR, + status_forcelist=[429, 500, 502, 503, 504], + ) + adapter = HTTPAdapter(max_retries=retry_strategy) + session.mount("http://", adapter) + session.mount("https://", adapter) + return session -def request_query(query, offset, limit, cache, session, timeout=DEFAULT_TIMEOUT): + +def request_query(query, offset, limit, cache, session): params = urlencode(dict(query=query, offset=offset, limit=limit)) url = f"{S2_QUERY_URL}?{params}" if url in cache: return cache[url] - reply = session.get(url) - response = reply.json() - - if "data" not in response: - msg = response.get("error") or response.get("message") or "unknown" - raise Exception(f"error while fetching {reply.url}: {msg}") + try: + response = session.get(url).json() + if "data" not in response: + msg = response.get("error") or response.get("message") or "unknown" + raise Exception(f"error while fetching {url}: {msg}") + except Exception as e: + raise Exception(f"error while fetching {url}: {e}") cache[url] = response return response -def request_paper(key, cache, session, timeout=DEFAULT_TIMEOUT): +def request_paper(key, cache, session): url = S2_PAPER_URL + quote_plus(key) if url in cache: return cache[url] try: - sleep(timeout) data = session.get(url).json() except Exception as e: logging.warning(f"failed to retrieve {key}: {e}") @@ -156,8 +173,10 @@ def fetch_semanticscholar(key: set, *, session=None) -> Optional[Document]: if key is None: return None + should_close_session = False if session is None: - session = requests.Session() + session = get_retry_session() + should_close_session = True with shelve.open(CACHE_FILE) as cache: if isinstance(key, DocumentIdentifier): @@ -176,6 +195,9 @@ def fetch_semanticscholar(key: set, *, session=None) -> Optional[Document]: else: data = request_paper(key, cache, session) + if should_close_session: + session.close() + if data is None: return None @@ -217,8 +239,10 @@ def search_semanticscholar( if not query: raise Exception("no query specified in `search_semanticscholar`") + should_close_session = False if session is None: - session = requests.Session() + session = get_retry_session() + should_close_session = True docs = [] @@ -255,4 +279,7 @@ def search_semanticscholar( else: logging.warn(f"could not find paper id {paper_id}") + if should_close_session: + session.close() + return DocumentSet(docs) diff --git a/tests/requests/22b58eb36e56f9fb4e5d2322f56c1aeb8fcefea2.pickle b/tests/requests/22b58eb36e56f9fb4e5d2322f56c1aeb8fcefea2.pickle new file mode 100644 index 0000000..b716156 Binary files /dev/null and b/tests/requests/22b58eb36e56f9fb4e5d2322f56c1aeb8fcefea2.pickle differ diff --git a/tests/requests/245f82b3fdc09eaed6a726cd4bddaa2f1565ba90.pickle b/tests/requests/245f82b3fdc09eaed6a726cd4bddaa2f1565ba90.pickle index 779d7eb..13d5ef7 100644 Binary files a/tests/requests/245f82b3fdc09eaed6a726cd4bddaa2f1565ba90.pickle and b/tests/requests/245f82b3fdc09eaed6a726cd4bddaa2f1565ba90.pickle differ diff --git a/tests/requests/313edba44d6ac1a460582583beb9be73ff850629.pickle b/tests/requests/313edba44d6ac1a460582583beb9be73ff850629.pickle new file mode 100644 index 0000000..2e61ec4 Binary files /dev/null and b/tests/requests/313edba44d6ac1a460582583beb9be73ff850629.pickle differ diff --git a/tests/requests/38ea16649a6c4d3e3d3a2a9246b25403b0c626bc.pickle b/tests/requests/38ea16649a6c4d3e3d3a2a9246b25403b0c626bc.pickle new file mode 100644 index 0000000..d957bb2 Binary files /dev/null and b/tests/requests/38ea16649a6c4d3e3d3a2a9246b25403b0c626bc.pickle differ diff --git a/tests/requests/4d39f93aa1c9ff4afee9f210b14ade9e5ccf3a58.pickle b/tests/requests/4d39f93aa1c9ff4afee9f210b14ade9e5ccf3a58.pickle deleted file mode 100644 index 48ce083..0000000 Binary files a/tests/requests/4d39f93aa1c9ff4afee9f210b14ade9e5ccf3a58.pickle and /dev/null differ diff --git a/tests/requests/4d7cb2624f13691cb56de261c1df723f59d45d03.pickle b/tests/requests/4d7cb2624f13691cb56de261c1df723f59d45d03.pickle new file mode 100644 index 0000000..cc18e4c Binary files /dev/null and b/tests/requests/4d7cb2624f13691cb56de261c1df723f59d45d03.pickle differ diff --git a/tests/requests/4eea59c658fa3076445495dea5554977b85511ff.pickle b/tests/requests/4eea59c658fa3076445495dea5554977b85511ff.pickle deleted file mode 100644 index e643467..0000000 Binary files a/tests/requests/4eea59c658fa3076445495dea5554977b85511ff.pickle and /dev/null differ diff --git a/tests/requests/5122a6fa38e030c8876096317e7e19aa6534e70a.pickle b/tests/requests/5122a6fa38e030c8876096317e7e19aa6534e70a.pickle deleted file mode 100644 index c4bf027..0000000 Binary files a/tests/requests/5122a6fa38e030c8876096317e7e19aa6534e70a.pickle and /dev/null differ diff --git a/tests/requests/53a29e0080a36665f126d494c23ec8dca32c1e91.pickle b/tests/requests/53a29e0080a36665f126d494c23ec8dca32c1e91.pickle new file mode 100644 index 0000000..095a71b Binary files /dev/null and b/tests/requests/53a29e0080a36665f126d494c23ec8dca32c1e91.pickle differ diff --git a/tests/requests/54f7b47eb9ceb574f63ec43e8b717364b75b3fa7.pickle b/tests/requests/54f7b47eb9ceb574f63ec43e8b717364b75b3fa7.pickle deleted file mode 100644 index 4bfa0d1..0000000 Binary files a/tests/requests/54f7b47eb9ceb574f63ec43e8b717364b75b3fa7.pickle and /dev/null differ diff --git a/tests/requests/5a3c0cac4c7366bda7fc01fd3210bc99bc3f46ad.pickle b/tests/requests/5a3c0cac4c7366bda7fc01fd3210bc99bc3f46ad.pickle new file mode 100644 index 0000000..7a7d430 Binary files /dev/null and b/tests/requests/5a3c0cac4c7366bda7fc01fd3210bc99bc3f46ad.pickle differ diff --git a/tests/requests/5dec8d884e7221dc8cad8d779c23884c91fde749.pickle b/tests/requests/5dec8d884e7221dc8cad8d779c23884c91fde749.pickle deleted file mode 100644 index 28ac1b9..0000000 Binary files a/tests/requests/5dec8d884e7221dc8cad8d779c23884c91fde749.pickle and /dev/null differ diff --git a/tests/requests/6028198cfd0c1f6c2e2b995ed4802d1c42fb07b2.pickle b/tests/requests/6028198cfd0c1f6c2e2b995ed4802d1c42fb07b2.pickle deleted file mode 100644 index 0819189..0000000 Binary files a/tests/requests/6028198cfd0c1f6c2e2b995ed4802d1c42fb07b2.pickle and /dev/null differ diff --git a/tests/requests/69c324521d0d2c90e2f0e3568671dec099601827.pickle b/tests/requests/69c324521d0d2c90e2f0e3568671dec099601827.pickle new file mode 100644 index 0000000..06ab5d1 Binary files /dev/null and b/tests/requests/69c324521d0d2c90e2f0e3568671dec099601827.pickle differ diff --git a/tests/requests/6a9fff6a7064528fd44202e78690da248afa23b7.pickle b/tests/requests/6a9fff6a7064528fd44202e78690da248afa23b7.pickle deleted file mode 100644 index 9f89bce..0000000 Binary files a/tests/requests/6a9fff6a7064528fd44202e78690da248afa23b7.pickle and /dev/null differ diff --git a/tests/requests/78a2bca757d42aced207b202458dfd3bf17f0c3d.pickle b/tests/requests/78a2bca757d42aced207b202458dfd3bf17f0c3d.pickle index 1a7fad4..e43f9ba 100644 Binary files a/tests/requests/78a2bca757d42aced207b202458dfd3bf17f0c3d.pickle and b/tests/requests/78a2bca757d42aced207b202458dfd3bf17f0c3d.pickle differ diff --git a/tests/requests/7c4b7301a9ccb48f60f360dd28ec6f0dfec9613f.pickle b/tests/requests/7c4b7301a9ccb48f60f360dd28ec6f0dfec9613f.pickle deleted file mode 100644 index 63260b7..0000000 Binary files a/tests/requests/7c4b7301a9ccb48f60f360dd28ec6f0dfec9613f.pickle and /dev/null differ diff --git a/tests/requests/81efb45e4c0d622bfdb8a37f5182be2f2e08a88f.pickle b/tests/requests/81efb45e4c0d622bfdb8a37f5182be2f2e08a88f.pickle new file mode 100644 index 0000000..a6ec3e1 Binary files /dev/null and b/tests/requests/81efb45e4c0d622bfdb8a37f5182be2f2e08a88f.pickle differ diff --git a/tests/requests/8e67f62dd8969671c5377565f8ac6017308433d5.pickle b/tests/requests/8e67f62dd8969671c5377565f8ac6017308433d5.pickle new file mode 100644 index 0000000..ac723a0 Binary files /dev/null and b/tests/requests/8e67f62dd8969671c5377565f8ac6017308433d5.pickle differ diff --git a/tests/requests/9b4fc567a80b12bae747c517a4890ce044307aa5.pickle b/tests/requests/9b4fc567a80b12bae747c517a4890ce044307aa5.pickle deleted file mode 100644 index e34f505..0000000 Binary files a/tests/requests/9b4fc567a80b12bae747c517a4890ce044307aa5.pickle and /dev/null differ diff --git a/tests/requests/c3c2090c3a0293d71314d226b24f9da74633e092.pickle b/tests/requests/c3c2090c3a0293d71314d226b24f9da74633e092.pickle deleted file mode 100644 index 60381fe..0000000 Binary files a/tests/requests/c3c2090c3a0293d71314d226b24f9da74633e092.pickle and /dev/null differ diff --git a/tests/requests/c9f32c88a88f59f0e9f468ffed0bb606c55913b6.pickle b/tests/requests/c9f32c88a88f59f0e9f468ffed0bb606c55913b6.pickle new file mode 100644 index 0000000..8baf610 Binary files /dev/null and b/tests/requests/c9f32c88a88f59f0e9f468ffed0bb606c55913b6.pickle differ diff --git a/tests/requests/cfca0170ac37869891777418ee9cf20f17faa581.pickle b/tests/requests/cfca0170ac37869891777418ee9cf20f17faa581.pickle deleted file mode 100644 index b47a041..0000000 Binary files a/tests/requests/cfca0170ac37869891777418ee9cf20f17faa581.pickle and /dev/null differ diff --git a/tests/requests/d1acbf3602743e93bf70589acb072ba87ef3b72b.pickle b/tests/requests/d1acbf3602743e93bf70589acb072ba87ef3b72b.pickle deleted file mode 100644 index fa8846a..0000000 Binary files a/tests/requests/d1acbf3602743e93bf70589acb072ba87ef3b72b.pickle and /dev/null differ diff --git a/tests/requests/e1e0dd5123bdf8442cc908de2e233ef355ee4c77.pickle b/tests/requests/e1e0dd5123bdf8442cc908de2e233ef355ee4c77.pickle new file mode 100644 index 0000000..c4fa325 Binary files /dev/null and b/tests/requests/e1e0dd5123bdf8442cc908de2e233ef355ee4c77.pickle differ diff --git a/tests/requests/ecbda91e09a794eb850a39eff643376ca6e09c4e.pickle b/tests/requests/ecbda91e09a794eb850a39eff643376ca6e09c4e.pickle new file mode 100644 index 0000000..967eb66 Binary files /dev/null and b/tests/requests/ecbda91e09a794eb850a39eff643376ca6e09c4e.pickle differ diff --git a/tests/requests/eeb9079866515efbc22fb670e78f14d11c099a9f.pickle b/tests/requests/eeb9079866515efbc22fb670e78f14d11c099a9f.pickle deleted file mode 100644 index 4aa528f..0000000 Binary files a/tests/requests/eeb9079866515efbc22fb670e78f14d11c099a9f.pickle and /dev/null differ diff --git a/tests/requests/fc3a9c19e6b4e122e77456a469fee6a35c00492b.pickle b/tests/requests/fc3a9c19e6b4e122e77456a469fee6a35c00492b.pickle new file mode 100644 index 0000000..add460b Binary files /dev/null and b/tests/requests/fc3a9c19e6b4e122e77456a469fee6a35c00492b.pickle differ diff --git a/tests/requests/ff2a790d6047bbc6bf3ee8d5cc73c47237b95bf8.pickle b/tests/requests/ff2a790d6047bbc6bf3ee8d5cc73c47237b95bf8.pickle deleted file mode 100644 index c56a566..0000000 Binary files a/tests/requests/ff2a790d6047bbc6bf3ee8d5cc73c47237b95bf8.pickle and /dev/null differ diff --git a/tests/test_sources_semanticscholar.py b/tests/test_sources_semanticscholar.py index a226061..d843127 100644 --- a/tests/test_sources_semanticscholar.py +++ b/tests/test_sources_semanticscholar.py @@ -13,7 +13,7 @@ def test_load_s2_file(): assert len(docs) == 3 docs = search_semanticscholar("litstudy", session=session) - assert any(doc.id.doi == "10.2139/ssrn.4079400" for doc in docs) + assert any(doc.id.s2id == "deb329f47bfd3cc053410af62403edf99cfe7f6c" for doc in docs) def test_fetch_semanticscholar():