diff --git a/paperscraper/__init__.py b/paperscraper/__init__.py index 5ce7001..4d2c3f6 100644 --- a/paperscraper/__init__.py +++ b/paperscraper/__init__.py @@ -1,6 +1,6 @@ """Initialize the module.""" __name__ = "paperscraper" -__version__ = "0.2.7" +__version__ = "0.2.8" import logging import os diff --git a/paperscraper/get_dumps/chemrxiv.py b/paperscraper/get_dumps/chemrxiv.py index 131632d..00feac2 100644 --- a/paperscraper/get_dumps/chemrxiv.py +++ b/paperscraper/get_dumps/chemrxiv.py @@ -28,11 +28,11 @@ def chemrxiv( stored in jsonl format in save_path. Args: - begin_date (Optional[str], optional): begin date expressed as YYYY-MM-DD. + begin_date (Optional[str]): begin date expressed as YYYY-MM-DD. Defaults to None. - end_date (Optional[str], optional): end date expressed as YYYY-MM-DD. + end_date (Optional[str]): end date expressed as YYYY-MM-DD. Defaults to None. - save_path (str, optional): Path where the dump is stored. + save_path (str): Path where the dump is stored. Defaults to save_path. """ diff --git a/paperscraper/xrxiv/xrxiv_api.py b/paperscraper/xrxiv/xrxiv_api.py index a53074a..2f003a6 100644 --- a/paperscraper/xrxiv/xrxiv_api.py +++ b/paperscraper/xrxiv/xrxiv_api.py @@ -1,10 +1,14 @@ """API for bioRxiv and medRXiv.""" +import logging from datetime import datetime +from time import sleep from typing import Generator, List, Optional import requests +from requests.exceptions import ConnectionError, Timeout launch_dates = {"biorxiv": "2013-01-01", "medrxiv": "2019-06-01"} +logger = logging.getLogger(__name__) class XRXivApi: @@ -38,15 +42,17 @@ def get_papers( begin_date: Optional[str] = None, end_date: Optional[str] = None, fields: List[str] = ["title", "doi", "authors", "abstract", "date", "journal"], + max_retries: int = 10, ) -> Generator: """ Get paper metadata. Args: - begin_date (Optional[str], optional): begin date. Defaults to None, a.k.a. launch date. - end_date (Optional[str], optional): end date. Defaults to None, a.k.a. today. + begin_date (Optional[str]): begin date. Defaults to None, a.k.a. launch date. + end_date (Optional[str]): end date. Defaults to None, a.k.a. today. fields (List[str], optional): fields to return per paper. Defaults to ['title', 'doi', 'authors', 'abstract', 'date', 'journal']. + max_retries (int): Number of retries on connection failure. Defaults to 10. Yields: Generator: a generator of paper metadata (dict) with the desired fields. @@ -68,20 +74,41 @@ def get_papers( do_loop = True cursor = 0 while do_loop: - json_response = requests.get( - self.get_papers_url.format( - begin_date=begin_date, end_date=end_date, cursor=cursor - ) - ).json() - do_loop = json_response["messages"][0]["status"] == "ok" - if do_loop: - cursor += json_response["messages"][0]["count"] - for paper in json_response["collection"]: - processed_paper = { - field: paper.get(field, "") for field in fields - } - yield processed_paper + papers = [] + for attempt in range(max_retries): + try: + json_response = requests.get( + self.get_papers_url.format( + begin_date=begin_date, end_date=end_date, cursor=cursor + ) + ).json() + do_loop = json_response["messages"][0]["status"] == "ok" + if do_loop: + cursor += json_response["messages"][0]["count"] + for paper in json_response["collection"]: + processed_paper = { + field: paper.get(field, "") for field in fields + } + papers.append(processed_paper) + + if do_loop: + yield from papers + break + except (ConnectionError, Timeout) as e: + logger.error( + f"Connection error: {e}. Retrying ({attempt + 1}/{max_retries})" + ) + sleep(5) + continue + except Exception as exc: + logger.exception(f"Failed getting papers: {exc}") + raise RuntimeError( + "Failed getting papers: {} - {}".format( + exc.__class__.__name__, exc + ) + ) except Exception as exc: + logger.exception(f"Failed getting papers: {exc}") raise RuntimeError( "Failed getting papers: {} - {}".format(exc.__class__.__name__, exc) )