diff --git a/Makefile b/Makefile index 72dca39..f26bbbb 100644 --- a/Makefile +++ b/Makefile @@ -19,13 +19,7 @@ cov: ## run pytest coverage report poetry run pytest --cov=graver --cov-report term-missing coveralls: ## report coverage data to coveralls.io - . $(VENV)/bin/activate && coveralls - -test-unit: ## run pytest unit tests only - poetry run pytest -rA -vvs --log-level INFO --without-integration - -test-integration: ## run pytest integration tests - poetry run pytest -rA -vvs --log-level INFO --with-integration + poetry run coveralls test: ## run pytest poetry run pytest -rA -vvs --log-level INFO @@ -34,10 +28,8 @@ lint: ## run flake8 to check the code poetry run flake8 $(PACKAGES) tests --count --select=E9,F63,F7,F82 --show-source --statistics poetry run flake8 $(PACKAGES) tests --count --exit-zero --max-complexity=10 --max-line-length=88 --statistics -# install-editable: -# . $(VENV)/bin/activate && pip install -e . install: - . $(VENV)/bin/activate && poetry install + poetry install fmt: ## run black to format the code poetry run isort $(PACKAGES) tests @@ -46,6 +38,8 @@ fmt: ## run black to format the code $(VENV)/init: ## init the virtual environment python3 -m venv $(VENV) touch $@ + $(VENV)/bin/activate && pip install -U pip + $(VENV)/bin/activate && pip install poetry $(VENV)/requirements: requirements.txt $(VENV)/init ## install requirements $(PIP) install -r $< diff --git a/poetry.lock b/poetry.lock index 7de03d2..0bacc13 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1001,6 +1001,25 @@ urllib3 = ">=1.21.1,<3" socks = ["PySocks (>=1.5.6,!=1.5.7)"] use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] +[[package]] +name = "requests-mock" +version = "1.11.0" +description = "Mock out responses from the requests package" +optional = false +python-versions = "*" +files = [ + {file = "requests-mock-1.11.0.tar.gz", hash = "sha256:ef10b572b489a5f28e09b708697208c4a3b2b89ef80a9f01584340ea357ec3c4"}, + {file = "requests_mock-1.11.0-py2.py3-none-any.whl", hash = "sha256:f7fae383f228633f6bececebdab236c478ace2284d6292c6e7e2867b9ab74d15"}, +] + +[package.dependencies] +requests = ">=2.3,<3" +six = "*" + +[package.extras] +fixture = ["fixtures"] +test = ["fixtures", "mock", "purl", "pytest", "requests-futures", "sphinx", "testtools"] + [[package]] name = "setuptools" version = "69.0.2" @@ -1017,6 +1036,17 @@ docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.1)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] +[[package]] +name = "six" +version = "1.16.0" +description = "Python 2 and 3 compatibility utilities" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" +files = [ + {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, + {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, +] + [[package]] name = "soupsieve" version = "2.5" @@ -1416,4 +1446,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "ea5b964571ec2e50ebe465f250690f72b3aa5596220bb930f64a893f86cefa4d" +content-hash = "a09a738e27e3a210a0e7d7640a50985cdbb1d6a8a8aad1b4f0e1c4590fb06ecc" diff --git a/pyproject.toml b/pyproject.toml index 829e31e..ad23e8a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,6 +73,7 @@ pytest-cov = "^4.1.0" pytest-helpers-namespace = "^2021.12.29" pytest-integration = "^0.2.3" vcrpy = "^5.1.0" +requests-mock = "^1.11.0" [tool.poetry.group.dev.dependencies] diff --git a/src/graver/api.py b/src/graver/api.py index 013a624..aef322b 100644 --- a/src/graver/api.py +++ b/src/graver/api.py @@ -8,7 +8,7 @@ from dataclasses import asdict, dataclass from re import Match from time import sleep -from typing import List, Optional, cast +from typing import Dict, List, Optional, cast from urllib.parse import parse_qsl, urlparse, urlunparse import requests @@ -44,9 +44,16 @@ class NotFound(MemorialException): class Driver(object): - recoverable: List[int] = [500, 502, 503, 504, 599] + recoverable_errors: Dict[int, str] = { + 500: "Internal Server Error", + 502: "Bad Gateway", + 503: "Service Unavailable", + 504: "Gateway Timeout", + 599: "Network Connect Timeout Error", + } def __init__(self, **kwargs) -> None: + self.num_retries = 0 self.max_retries: int = int(kwargs.get("max_retries", 3)) self.retry_ms: int = int(kwargs.get("retry_ms", 500)) self.session = requests.Session() @@ -55,7 +62,10 @@ def __init__(self, **kwargs) -> None: def get(self, findagrave_url: str, **kwargs) -> Response: retries = 0 response = self.session.get(findagrave_url, **kwargs) - while response.status_code in Driver.recoverable and retries < self.max_retries: + while ( + response.status_code in Driver.recoverable_errors.keys() + and retries < self.max_retries + ): retries += 1 log.warning( f"Driver: [{response.status_code}: {response.reason}] {findagrave_url} " @@ -64,6 +74,7 @@ def get(self, findagrave_url: str, **kwargs) -> Response: ) sleep(self.retry_ms / 1000) response = self.session.get(findagrave_url, **kwargs) + self.num_retries += retries return response @@ -90,13 +101,8 @@ def __init__(self, findagrave_url: str, **kwargs) -> None: self.driver = kwargs.get("driver", Driver()) self.get = kwargs.get("get", True) self.scrape = kwargs.get("scrape", True) - # self.search_url: Optional[str] = None - # self.soup: BeautifulSoup self.params: dict = {} - if "page" in kwargs: - self.params["page"] = kwargs.get("page") - if self.get: response = self.driver.get(findagrave_url, params=self.params) self.soup = BeautifulSoup(response.content, "html.parser") @@ -379,22 +385,25 @@ def __init__(self, findagrave_url: str, **kwargs) -> None: response = self.driver.get(self.findagrave_url) self.soup = BeautifulSoup(response.content, "html.parser") - if not response.ok: - if self.check_removed(): - msg = f"{self.findagrave_url} has been removed" - raise MemorialRemovedException(msg) - elif (new_url := self.check_merged()) is not None: - msg = f"{self.findagrave_url} has been merged into {new_url}" - raise MemorialMergedException(msg, self.findagrave_url, new_url) - else: - response.raise_for_status() - else: + if response.ok: self.scrape_canonical_url() # Valid URL but not a Memorial if "/memorial/" not in self.findagrave_url: raise MemorialException( f"Invalid memorial URL: {self.findagrave_url}" ) + else: + if response.status_code == 404: + if self.check_removed(): + msg = f"{self.findagrave_url} has been removed" + raise MemorialRemovedException(msg) + elif (new_url := self.check_merged()) is not None: + msg = f"{self.findagrave_url} has been merged into {new_url}" + raise MemorialMergedException(msg, self.findagrave_url, new_url) + else: + response.raise_for_status() + else: + response.raise_for_status() if self.scrape: self.scrape_page() diff --git a/tests/test_api.py b/tests/test_api.py index ba555c3..c3b5666 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -7,7 +7,13 @@ from requests import HTTPError import graver.api -from graver import Cemetery, Memorial, MemorialMergedException, MemorialRemovedException +from graver import ( + Cemetery, + Driver, + Memorial, + MemorialMergedException, + MemorialRemovedException, +) logging.basicConfig() vcr_log = logging.getLogger("vcr") @@ -45,23 +51,37 @@ def get_cassette(name: str): ] -# @pytest.fixture() -# def memorials(): -# m_list = [] -# path = f"{PROJECT_ROOT}/tests/fixtures/memorials/*.json" -# file_list = glob.glob(path, recursive=False) -# for file in file_list: -# with open(file) as f: -# m_list.append(json.load(f)) -# return m_list - - def test_vcr(): with vcr.use_cassette(pytest.vcr_cassettes + "synopsis.yaml"): response = requests.get("http://www.iana.org/domains/reserved") assert b"Example domains" in response.content +@pytest.mark.parametrize("url", ["https://www.findagrave.com/memorial/544"]) +@pytest.mark.parametrize( + "status_code, reason", + [ + (500, "Internal Server Error"), + (502, "Bad Gateway"), + (503, "Service Unavailable"), + (504, "Gateway Timeout"), + (599, "Network Connect Timeout Error"), + ], +) +def test_driver_retries_recoverable_errors(url, status_code, reason, requests_mock): + requests_mock.get( + url, + [ + {"status_code": status_code, "reason": reason}, + {"status_code": 200, "reason": "None"}, + ], + ) + driver = Driver(retry_ms=10, max_retries=1) + response = driver.get(url) + assert response.ok and response.status_code == 200 + assert driver.num_retries == 1 + + @pytest.mark.parametrize( "url, cassette", [