Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 41 additions & 14 deletions litstudy/sources/semanticscholar.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
from time import sleep
from typing import Tuple, Optional
from urllib.parse import urlencode, quote_plus

from requests.adapters import HTTPAdapter
from urllib3.util import Retry
import logging
import requests
import shelve
Expand Down Expand Up @@ -90,38 +93,52 @@ def load(id):
return fetch_semanticscholar(id)


# Base URL for single-paper lookups; request_paper appends the URL-quoted key.
# NOTE(review): S2_PAPER_URL is bound twice below, so the later (graph/v1)
# value is the one in effect. The pair looks like the before/after sides of
# a diff -- confirm which line is meant to remain.
S2_PAPER_URL = "https://api.semanticscholar.org/v1/paper/"
S2_PAPER_URL = "https://api.semanticscholar.org/graph/v1/paper/"
# Search endpoint; request_query appends the url-encoded query/offset/limit.
S2_QUERY_URL = "https://api.semanticscholar.org/graph/v1/paper/search"
# Path handed to shelve.open() for the on-disk response cache.
# NOTE(review): CACHE_FILE is also bound twice; the later, correctly spelled
# ".semanticscholar" wins -- confirm the first binding should be dropped.
CACHE_FILE = ".semantischolar"
# Default for the `timeout` parameter, which request_paper passes to sleep()
# before issuing a request (a pause between calls, not a socket timeout).
DEFAULT_TIMEOUT = 3.05 # 100 requests per 5 minutes
CACHE_FILE = ".semanticscholar"
# Passed as Retry(total=...) in get_retry_session.
MAX_RETRIES = 3
# Passed as Retry(backoff_factor=...) in get_retry_session.
BACKOFF_FACTOR = 1.05


def get_retry_session():
    """Create a ``requests.Session`` that retries failed requests.

    A single ``HTTPAdapter`` configured with a ``Retry`` policy is mounted
    for both the ``http://`` and ``https://`` prefixes. The policy allows up
    to ``MAX_RETRIES`` retries, uses ``BACKOFF_FACTOR`` as the backoff
    factor, and forces a retry on the 429, 500, 502, 503 and 504 status
    codes.

    Returns:
        The configured session. The caller is responsible for closing it.
    """
    retrying_adapter = HTTPAdapter(
        max_retries=Retry(
            total=MAX_RETRIES,
            backoff_factor=BACKOFF_FACTOR,
            status_forcelist=[429, 500, 502, 503, 504],
        )
    )

    http_session = requests.Session()
    # Same adapter instance for both schemes, so they share one retry policy.
    for prefix in ("http://", "https://"):
        http_session.mount(prefix, retrying_adapter)
    return http_session

def request_query(query, offset, limit, cache, session):
    """Run one page of a Semantic Scholar paper search, using the cache.

    Args:
        query: Search string sent as the ``query`` parameter.
        offset: Index of the first result to return.
        limit: Maximum number of results in this page.
        cache: Mapping from request URL to decoded response (the callers in
            this file pass a ``shelve`` object). Checked before any network
            access and updated after a successful request.
        session: Object with a ``requests``-style ``get(url)`` method.

    Returns:
        The decoded JSON response, which always contains a ``"data"`` key.

    Raises:
        Exception: If the request or JSON decoding fails (the original error
            is chained as ``__cause__``), or if the response carries no
            ``"data"`` entry.
    """
    params = urlencode(dict(query=query, offset=offset, limit=limit))
    url = f"{S2_QUERY_URL}?{params}"

    if url in cache:
        return cache[url]

    # Keep the try body down to the network call and decoding, so the
    # "missing data" error raised below is not caught and wrapped a second
    # time (which used to duplicate the "error while fetching" prefix).
    try:
        response = session.get(url).json()
    except Exception as e:
        raise Exception(f"error while fetching {url}: {e}") from e

    is_mapping = isinstance(response, dict)
    if not is_mapping or "data" not in response:
        msg = "unknown"
        if is_mapping:
            # The API reports failures under either "error" or "message".
            msg = response.get("error") or response.get("message") or "unknown"
        raise Exception(f"error while fetching {url}: {msg}")

    # Only successful responses are cached, so failures are retried later.
    cache[url] = response
    return response


def request_paper(key, cache, session, timeout=DEFAULT_TIMEOUT):
def request_paper(key, cache, session):
url = S2_PAPER_URL + quote_plus(key)

if url in cache:
return cache[url]

try:
sleep(timeout)
data = session.get(url).json()
except Exception as e:
logging.warning(f"failed to retrieve {key}: {e}")
Expand Down Expand Up @@ -156,8 +173,10 @@ def fetch_semanticscholar(key: set, *, session=None) -> Optional[Document]:
if key is None:
return None

should_close_session = False
if session is None:
session = requests.Session()
session = get_retry_session()
should_close_session = True

with shelve.open(CACHE_FILE) as cache:
if isinstance(key, DocumentIdentifier):
Expand All @@ -176,6 +195,9 @@ def fetch_semanticscholar(key: set, *, session=None) -> Optional[Document]:
else:
data = request_paper(key, cache, session)

if should_close_session:
session.close()

if data is None:
return None

Expand Down Expand Up @@ -217,8 +239,10 @@ def search_semanticscholar(
if not query:
raise Exception("no query specified in `search_semanticscholar`")

should_close_session = False
if session is None:
session = requests.Session()
session = get_retry_session()
should_close_session = True

docs = []

Expand Down Expand Up @@ -255,4 +279,7 @@ def search_semanticscholar(
else:
logging.warn(f"could not find paper id {paper_id}")

if should_close_session:
session.close()

return DocumentSet(docs)
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file modified tests/requests/78a2bca757d42aced207b202458dfd3bf17f0c3d.pickle
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
2 changes: 1 addition & 1 deletion tests/test_sources_semanticscholar.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def test_load_s2_file():
assert len(docs) == 3

docs = search_semanticscholar("litstudy", session=session)
assert any(doc.id.doi == "10.2139/ssrn.4079400" for doc in docs)
assert any(doc.id.s2id == "deb329f47bfd3cc053410af62403edf99cfe7f6c" for doc in docs)


def test_fetch_semanticscholar():
Expand Down