Skip to content

Commit

Permalink
Graceful handling of connection errors (#35)
Browse files Browse the repository at this point in the history
* feat: gracefully retry on connection failure

* doc: version bump
  • Loading branch information
jannisborn authored Dec 8, 2023
1 parent a8d4f50 commit db4f0c1
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 19 deletions.
2 changes: 1 addition & 1 deletion paperscraper/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Initialize the module."""
__name__ = "paperscraper"
__version__ = "0.2.7"
__version__ = "0.2.8"

import logging
import os
Expand Down
6 changes: 3 additions & 3 deletions paperscraper/get_dumps/chemrxiv.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,11 @@ def chemrxiv(
stored in jsonl format in save_path.
Args:
begin_date (Optional[str], optional): begin date expressed as YYYY-MM-DD.
begin_date (Optional[str]): begin date expressed as YYYY-MM-DD.
Defaults to None.
end_date (Optional[str], optional): end date expressed as YYYY-MM-DD.
end_date (Optional[str]): end date expressed as YYYY-MM-DD.
Defaults to None.
save_path (str, optional): Path where the dump is stored.
save_path (str): Path where the dump is stored.
Defaults to save_path.
"""

Expand Down
57 changes: 42 additions & 15 deletions paperscraper/xrxiv/xrxiv_api.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
"""API for bioRxiv and medRxiv."""
import logging
from datetime import datetime
from time import sleep
from typing import Generator, List, Optional

import requests
from requests.exceptions import ConnectionError, Timeout

launch_dates = {"biorxiv": "2013-01-01", "medrxiv": "2019-06-01"}
logger = logging.getLogger(__name__)


class XRXivApi:
Expand Down Expand Up @@ -38,15 +42,17 @@ def get_papers(
begin_date: Optional[str] = None,
end_date: Optional[str] = None,
fields: List[str] = ["title", "doi", "authors", "abstract", "date", "journal"],
max_retries: int = 10,
) -> Generator:
"""
Get paper metadata.
Args:
begin_date (Optional[str], optional): begin date. Defaults to None, a.k.a. launch date.
end_date (Optional[str], optional): end date. Defaults to None, a.k.a. today.
begin_date (Optional[str]): begin date. Defaults to None, a.k.a. launch date.
end_date (Optional[str]): end date. Defaults to None, a.k.a. today.
fields (List[str], optional): fields to return per paper.
Defaults to ['title', 'doi', 'authors', 'abstract', 'date', 'journal'].
max_retries (int): Number of retries on connection failure. Defaults to 10.
Yields:
Generator: a generator of paper metadata (dict) with the desired fields.
Expand All @@ -68,20 +74,41 @@ def get_papers(
do_loop = True
cursor = 0
while do_loop:
json_response = requests.get(
self.get_papers_url.format(
begin_date=begin_date, end_date=end_date, cursor=cursor
)
).json()
do_loop = json_response["messages"][0]["status"] == "ok"
if do_loop:
cursor += json_response["messages"][0]["count"]
for paper in json_response["collection"]:
processed_paper = {
field: paper.get(field, "") for field in fields
}
yield processed_paper
papers = []
for attempt in range(max_retries):
try:
json_response = requests.get(
self.get_papers_url.format(
begin_date=begin_date, end_date=end_date, cursor=cursor
)
).json()
do_loop = json_response["messages"][0]["status"] == "ok"
if do_loop:
cursor += json_response["messages"][0]["count"]
for paper in json_response["collection"]:
processed_paper = {
field: paper.get(field, "") for field in fields
}
papers.append(processed_paper)

if do_loop:
yield from papers
break
except (ConnectionError, Timeout) as e:
logger.error(
f"Connection error: {e}. Retrying ({attempt + 1}/{max_retries})"
)
sleep(5)
continue
except Exception as exc:
logger.exception(f"Failed getting papers: {exc}")
raise RuntimeError(
"Failed getting papers: {} - {}".format(
exc.__class__.__name__, exc
)
)
except Exception as exc:
logger.exception(f"Failed getting papers: {exc}")
raise RuntimeError(
"Failed getting papers: {} - {}".format(exc.__class__.__name__, exc)
)
Expand Down

0 comments on commit db4f0c1

Please sign in to comment.