From 51b95d92991eb10a14df47a7ae31990a2bc8c5dd Mon Sep 17 00:00:00 2001
From: Jonathan Green
Date: Fri, 8 Mar 2024 17:05:17 -0400
Subject: [PATCH] Add feed download command (#21)

* Add feed download commands

* Update README
---
 README.md                             |   7 +
 poetry.lock                           |  24 ++-
 pyproject.toml                        |   3 +
 src/palace_tools/cli/download_feed.py | 110 +++++++++++++
 src/palace_tools/feeds/__init__.py    |   0
 src/palace_tools/feeds/axis.py        |  43 +++++
 src/palace_tools/feeds/opds.py        |  64 ++++++++
 src/palace_tools/feeds/overdrive.py   | 220 ++++++++++++++++++++++++++
 8 files changed, 470 insertions(+), 1 deletion(-)
 create mode 100644 src/palace_tools/cli/download_feed.py
 create mode 100644 src/palace_tools/feeds/__init__.py
 create mode 100644 src/palace_tools/feeds/axis.py
 create mode 100644 src/palace_tools/feeds/opds.py
 create mode 100644 src/palace_tools/feeds/overdrive.py

diff --git a/README.md b/README.md
index 85f69d9..e575667 100644
--- a/README.md
+++ b/README.md
@@ -25,6 +25,13 @@ manifest conforming to the [Audiobook Profile](https://github.com/readium/webpub
   a local directory containing audiobook manifests and their associated media files.
   - Note: This application uses `python-vlc` which requires VLC to be installed on the system. The VLC
     installation can be found [here](https://www.videolan.org/vlc/).
+- `download-feed` - Download various feeds for local inspection.
+  - `opds2`
+    - Download an OPDS2 / OPDS2 + ODL feed.
+  - `overdrive`
+    - Download Overdrive feeds.
+  - `axis`
+    - Download B&T Axis 360 availability feed.
 
 ### Library Support
 
diff --git a/poetry.lock b/poetry.lock
index 26b7f29..ede04d7 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -708,6 +708,17 @@ files = [
     {file = "types_pytz-2024.1.0.20240203-py3-none-any.whl", hash = "sha256:9679eef0365db3af91ef7722c199dbb75ee5c1b67e3c4dd7bfbeb1b8a71c21a3"},
 ]
 
+[[package]]
+name = "types-xmltodict"
+version = "0.13.0.3"
+description = "Typing stubs for xmltodict"
+optional = false
+python-versions = "*"
+files = [
+    {file = "types-xmltodict-0.13.0.3.tar.gz", hash = "sha256:8884534bab0364c4b22d5973f3c8153ff40d413a801d9e70eb893e676909f1fc"},
+    {file = "types_xmltodict-0.13.0.3-py3-none-any.whl", hash = "sha256:cb251c59e838986d8402b10d804225ade9fd6c9f66d01dc45cd6cfdf43640128"},
+]
+
 [[package]]
 name = "typing-extensions"
 version = "4.9.0"
@@ -753,7 +764,18 @@ platformdirs = ">=3.9.1,<5"
 docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"]
 test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"]
 
+[[package]]
+name = "xmltodict"
+version = "0.13.0"
+description = "Makes working with XML feel like you are working with JSON"
+optional = false
+python-versions = ">=3.4"
+files = [
+    {file = "xmltodict-0.13.0-py2.py3-none-any.whl", hash = "sha256:aa89e8fd76320154a40d19a0df04a4695fb9dc5ba977cbb68ab3e4eb225e7852"},
+    {file = "xmltodict-0.13.0.tar.gz", hash = "sha256:341595a488e3e01a85a9d8911d8912fd922ede5fecc4dce437eb4b6c8d037e56"},
+]
+
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.10,<4"
-content-hash = "67e54f28f347ed760ac2fb696cbbf44590916415290e0e4c80cd504ebd526733"
+content-hash = "9efac03c52aff2490dfaf3e3f2d8d9f638981e1b5889faac3eddbcc3d83078fc"
diff --git a/pyproject.toml b/pyproject.toml
index 0449e70..8d4600c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,6 +37,7 @@ rich = "^13.7.1"
 textual = "^0.52.1"
 typer = "^0.9.0"
 typing_extensions = {version = "^4.9.0", python = "<3.11"}
+xmltodict = "^0.13.0"
 
 [tool.poetry.group.ci.dependencies]
 pre-commit = "^3.6.1"
@@ -44,9 +45,11 @@ pre-commit = "^3.6.1"
 
 [tool.poetry.group.dev.dependencies]
 mypy = "^1.8.0"
 types-pytz = "^2024.1.0.20240203"
+types-xmltodict = "^0.13.0.3"
 
 [tool.poetry.scripts]
 audiobook-manifest-summary = "palace_tools.cli.summarize_rwpm_audio_manifest:main"
+download-feed = "palace_tools.cli.download_feed:main"
 fetch-lcp = "palace_tools.cli.fetch_lcp:main"
 palace-terminal = "palace_tools.cli.palace_terminal:main"
 patron-bookshelf = "palace_tools.cli.patron_bookshelf:main"
diff --git a/src/palace_tools/cli/download_feed.py b/src/palace_tools/cli/download_feed.py
new file mode 100644
index 0000000..4cdc185
--- /dev/null
+++ b/src/palace_tools/cli/download_feed.py
@@ -0,0 +1,110 @@
+import asyncio
+import json
+from pathlib import Path
+from xml.dom import minidom
+
+import typer
+import xmltodict
+
+from palace_tools.feeds import axis, opds, overdrive
+from palace_tools.feeds.opds import write_json
+from palace_tools.utils.typer import run_typer_app_as_main
+
+app = typer.Typer()
+
+
+@app.command("axis")
+def download_axis(
+    username: str = typer.Option(..., "--username", "-u", help="Username"),
+    password: str = typer.Option(..., "--password", "-p", help="Password"),
+    library_id: str = typer.Option(..., "-l", "--library-id", help="Library ID"),
+    output_json: bool = typer.Option(False, "-j", "--json", help="Output JSON file"),
+    qa_endpoint: bool = typer.Option(False, "-q", "--qa", help="Use QA Endpoint"),
+    output_file: Path = typer.Argument(
+        ..., help="Output file", writable=True, file_okay=True, dir_okay=False
+    ),
+) -> None:
+    """Download B&T Axis 360 feed."""
+
+    # Find the base URL to use
+    base_url = axis.PRODUCTION_BASE_URL if not qa_endpoint else axis.QA_BASE_URL
+
+    # Fetch the document as XML
+    xml = axis.availability(base_url, username, password, library_id)
+
+    with output_file.open("w") as file:
+        if output_json:
+            xml_dict = xmltodict.parse(xml)
+            file.write(json.dumps(xml_dict, indent=4))
+        else:
+            parsed = minidom.parseString(xml)
+            file.write(parsed.toprettyxml())
+
+
+@app.command("overdrive")
+def download_overdrive(
+    client_key: str = typer.Option(..., "-k", "--client-key", help="Client Key"),
+    client_secret: str = typer.Option(
+        ..., "-s", "--client-secret", help="Client Secret"
+    ),
+    library_id: str = typer.Option(..., "-l", "--library-id", help="Library ID"),
+    parent_library_id: str = typer.Option(
+        None,
+        "-p",
+        "--parent-library-id",
+        help="Parent Library ID (for Advantage Accounts)",
+    ),
+    fetch_metadata: bool = typer.Option(
+        False, "-m", "--metadata", help="Fetch metadata"
+    ),
+    fetch_availability: bool = typer.Option(
+        False, "-a", "--availability", help="Fetch availability"
+    ),
+    qa_endpoint: bool = typer.Option(False, "-q", "--qa", help="Use QA Endpoint"),
+    connections: int = typer.Option(
+        20, "-c", "--connections", help="Number of connections to use"
+    ),
+    output_file: Path = typer.Argument(
+        ..., help="Output file", writable=True, file_okay=True, dir_okay=False
+    ),
+) -> None:
+    """Download Overdrive feed."""
+    base_url = overdrive.QA_BASE_URL if qa_endpoint else overdrive.PROD_BASE_URL
+    products = asyncio.run(
+        overdrive.fetch(
+            base_url,
+            client_key,
+            client_secret,
+            library_id,
+            parent_library_id,
+            fetch_metadata,
+            fetch_availability,
+            connections,
+        )
+    )
+
+    with output_file.open("w") as file:
+        file.write(json.dumps(products, indent=4))
+
+
+@app.command("opds2")
+def download_opds(
+    username: str = typer.Option(None, "--username", "-u", help="Username"),
+    password: str = typer.Option(None, "--password", "-p", help="Password"),
+    url: str = typer.Argument(..., help="URL of feed", metavar="URL"),
+    output_file: Path = typer.Argument(
+        ..., help="Output file", writable=True, file_okay=True, dir_okay=False
+    ),
+) -> None:
+    """Download OPDS 2 feed."""
+    publications = opds.fetch(url, username, password)
+    with output_file.open("w") as file:
+        write_json(file, publications)
+
+
+def main() -> None:
+    run_typer_app_as_main(app, prog_name="download-feed")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/palace_tools/feeds/__init__.py b/src/palace_tools/feeds/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/palace_tools/feeds/axis.py b/src/palace_tools/feeds/axis.py
new file mode 100644
index 0000000..e967941
--- /dev/null
+++ b/src/palace_tools/feeds/axis.py
@@ -0,0 +1,43 @@
+import base64
+import json
+import sys
+
+import httpx
+
+PRODUCTION_BASE_URL = "https://axis360api.baker-taylor.com/Services/VendorAPI/"
+QA_BASE_URL = "https://axis360apiqa.baker-taylor.com/Services/VendorAPI/"
+
+access_token_endpoint = "accesstoken"
+availability_endpoint = "availability/v2"
+
+
+def get_headers(
+    base_url: str, username: str, password: str, library_id: str
+) -> dict[str, str]:
+    authorization_str = ":".join([username, password, library_id])
+    authorization_bytes = authorization_str.encode("utf_16_le")
+    authorization_b64 = base64.standard_b64encode(authorization_bytes)
+    resp = httpx.post(
+        base_url + access_token_endpoint,
+        headers={"Authorization": f"Basic {authorization_b64.decode('utf-8')}"},
+    )
+    if resp.status_code != 200:
+        print(f"Error: {resp.status_code}")
+        print(f"Headers: {json.dumps(dict(resp.headers), indent=4)}")
+        print(resp.text)
+        sys.exit(-1)
+    return {
+        "Authorization": "Bearer " + resp.json()["access_token"],
+        "Library": library_id,
+    }
+
+
+def availability(base_url: str, username: str, password: str, library_id: str) -> str:
+    headers = get_headers(base_url, username, password, library_id)
+    resp = httpx.get(
+        base_url + availability_endpoint,
+        headers=headers,
+        params={"updatedDate": "1970-01-01 00:00:00"},
+        timeout=30.0,
+    )
+    return resp.text
diff --git a/src/palace_tools/feeds/opds.py b/src/palace_tools/feeds/opds.py
new file mode 100644
index 0000000..7d973f6
--- /dev/null
+++ b/src/palace_tools/feeds/opds.py
@@ -0,0 +1,64 @@
+from __future__ import annotations
+
+import json
+import math
+import sys
+from typing import Any, TextIO
+
+import httpx
+from rich.progress import MofNCompleteColumn, Progress, SpinnerColumn
+
+
+def make_request(session: httpx.Client, url: str) -> dict[str, Any]:
+    response = session.get(url)
+    if response.status_code != 200:
+        print(f"Error: {response.status_code}")
+        print(f"Headers: {json.dumps(dict(response.headers), indent=4)}")
+        print(response.text)
+        sys.exit(-1)
+    return response.json()  # type: ignore[no-any-return]
+
+
+def write_json(file: TextIO, data: list[dict[str, Any]]) -> None:
+    file.write(json.dumps(data, indent=4))
+
+
+def fetch(
+    url: str, username: str | None, password: str | None
+) -> list[dict[str, Any]]:
+    # Create a session to fetch the documents
+    client = httpx.Client()
+    client.headers.update({"Accept": "application/opds+json", "User-Agent": "Palace"})
+    client.timeout = httpx.Timeout(30.0)
+
+    if username and password:
+        client.auth = httpx.BasicAuth(username, password)
+
+    publications = []
+
+    # Get the first page
+    response = make_request(client, url)
+    items = response.get("metadata", {}).get("numberOfItems")
+    items_per_page = response.get("metadata", {}).get("itemsPerPage")
+
+    if items is None or items_per_page is None:
+        pages = None
+    else:
+        pages = math.ceil(items / items_per_page)
+
+    # Fetch the rest of the pages
+    next_url: str | None = url
+    with Progress(
+        SpinnerColumn(), *Progress.get_default_columns(), MofNCompleteColumn()
+    ) as progress:
+        download_task = progress.add_task("Downloading Feed", total=pages)
+        while next_url is not None:
+            response = make_request(client, next_url)
+            publications.extend(response["publications"])
+            next_url = None
+            for link in response["links"]:
+                if link["rel"] == "next":
+                    next_url = link["href"]
+                    break
+            progress.update(download_task, advance=1)
+
+    return publications
diff --git a/src/palace_tools/feeds/overdrive.py b/src/palace_tools/feeds/overdrive.py
new file mode 100644
index 0000000..a33070c
--- /dev/null
+++ b/src/palace_tools/feeds/overdrive.py
@@ -0,0 +1,220 @@
+from __future__ import annotations
+
+import asyncio
+import json
+import math
+import sys
+from collections import defaultdict, deque
+from typing import Any
+
+import httpx
+from httpx import URL, HTTPStatusError, Limits, RequestError, Response, Timeout
+from rich.progress import MofNCompleteColumn, Progress, SpinnerColumn
+
+QA_BASE_URL = "https://integration.api.overdrive.com"
+PROD_BASE_URL = "https://api.overdrive.com"
+
+TOKEN_ENDPOINT = "https://oauth.overdrive.com/token"
+EVENTS_ENDPOINT = "/v1/collections/%(collection_token)s/products"
+LIBRARY_ENDPOINT = "/v1/libraries/%(library_id)s"
+ADVANTAGE_LIBRARY_ENDPOINT = (
+    "/v1/libraries/%(parent_library_id)s/advantageAccounts/%(library_id)s"
+)
+
+
+def handle_error(resp: Response) -> None:
+    if resp.status_code == 200:
+        return
+    print(f"URL: {resp.url}")
+    print(f"Error: {resp.status_code}")
+    print(f"Headers: {json.dumps(dict(resp.headers), indent=4)}")
+    print(resp.text)
+    sys.exit(-1)
+
+
+async def get_auth_token(
+    http: httpx.AsyncClient, client_key: str, client_secret: str
+) -> str:
+    auth = (client_key, client_secret)
+    resp = await http.post(
+        TOKEN_ENDPOINT, auth=auth, data=dict(grant_type="client_credentials")
+    )
+    handle_error(resp)
+    return resp.json()["access_token"]  # type: ignore[no-any-return]
+
+
+def get_headers(auth_token: str) -> dict[str, str]:
+    return {"Authorization": "Bearer " + auth_token, "User-Agent": "Palace"}
+
+
+async def get_collection_token(
+    http: httpx.AsyncClient, library_id: str, parent_library_id: str | None
+) -> str:
+    variables = {
+        "parent_library_id": parent_library_id,
+        "library_id": library_id,
+    }
+
+    endpoint = ADVANTAGE_LIBRARY_ENDPOINT if parent_library_id else LIBRARY_ENDPOINT
+
+    resp = await http.get(endpoint % variables)
+    handle_error(resp)
+    return resp.json()["collectionToken"]  # type: ignore[no-any-return]
+
+
+def event_url(
+    collection_token: str,
+    sort: str = "popularity:desc",
+    limit: int = 200,
+    offset: int | None = None,
+) -> str:
+    url = EVENTS_ENDPOINT % {"collection_token": collection_token}
+    params = {"sort": sort, "limit": limit}
+    if offset is not None:
+        params["offset"] = offset
+
+    return url + "?" + "&".join(f"{k}={v}" for k, v in params.items())
+
+
+def make_request(
+    client: httpx.AsyncClient,
+    urls: deque[str] | str,
+    pending_requests: list[asyncio.Task[Response]],
+) -> None:
+    if isinstance(urls, str):
+        url = urls
+    else:
+        url = urls.pop()
+    req = client.get(url)
+    task = asyncio.create_task(req)
+    pending_requests.append(task)
+
+
+def process_request(
+    response: Response,
+    request_metadata: bool,
+    request_availability: bool,
+    base_url: str,
+    events_path: str,
+    products: dict[str, Any],
+    urls: deque[str],
+) -> None:
+    data = response.raise_for_status().json()
+    path = response.url.path
+    if path == events_path:
+        response_products = data["products"]
+        for product in response_products:
+            if request_metadata:
+                urls.append(product["links"]["metadata"]["href"].removeprefix(base_url))
+            if request_availability:
+                urls.append(
+                    product["links"]["availability"]["href"].removeprefix(base_url)
+                )
+                urls.append(
+                    product["links"]["availabilityV2"]["href"].removeprefix(base_url)
+                )
+            products[product["id"].lower()] = product
+    elif path.endswith("availability") and path.startswith("/v1/"):
+        products[data["id"].lower()]["availability"] = data
+    elif path.endswith("availability") and path.startswith("/v2/"):
+        products[data["reserveId"].lower()]["availabilityV2"] = data
+    elif path.endswith("metadata") and path.startswith("/v1/"):
+        products[data["id"].lower()]["metadata"] = data
+    else:
+        raise RuntimeError(f"Unknown URL: {response.url}")
+
+
+async def fetch(
+    base_url: str,
+    client_key: str,
+    client_secret: str,
+    library_id: str,
+    parent_library_id: str | None,
+    fetch_metadata: bool,
+    fetch_availability: bool,
+    connections: int,
+) -> list[dict[str, Any]]:
+    async with httpx.AsyncClient(
+        timeout=Timeout(20.0, pool=None),
+        limits=Limits(
+            max_connections=connections,
+            max_keepalive_connections=connections,
+            keepalive_expiry=5,
+        ),
+    ) as client:
+        auth_token = await get_auth_token(client, client_key, client_secret)
+
+        client.headers.update(get_headers(auth_token))
+        client.base_url = URL(base_url)
+
+        collection_token = await get_collection_token(
+            client, library_id, parent_library_id
+        )
+
+        first_page = await client.get(event_url(collection_token))
+        handle_error(first_page)
+        first_page_data = first_page.json()
+
+        items = first_page_data["totalItems"]
+        items_per_page = first_page_data["limit"]
+        pages = math.ceil(items / items_per_page)
+
+        fetches = (
+            pages
+            + (items if fetch_metadata else 0)
+            + (items * 2 if fetch_availability else 0)
+        )
+        with Progress(
+            SpinnerColumn(), *Progress.get_default_columns(), MofNCompleteColumn()
+        ) as progress:
+            download_task = progress.add_task("Downloading Feed", total=fetches)
+            urls: deque[str] = deque()
+            pending_requests: list[asyncio.Task[Response]] = []
+            products: dict[str, Any] = {}
+            retried_requests: defaultdict[str, int] = defaultdict(int)
+
+            for i in range(pages):
+                urls.append(event_url(collection_token, offset=i * items_per_page))
+
+            for i in range(min(connections * 2, len(urls))):
+                make_request(client, urls, pending_requests)
+
+            while pending_requests:
+                done, pending = await asyncio.wait(
+                    pending_requests, return_when=asyncio.FIRST_COMPLETED
+                )
+
+                pending_requests = list(pending)
+                events_path = EVENTS_ENDPOINT % {"collection_token": collection_token}
+
+                for req in done:
+                    try:
+                        response = await req
+                        process_request(
+                            response,
+                            fetch_metadata,
+                            fetch_availability,
+                            base_url,
+                            events_path,
+                            products,
+                            urls,
+                        )
+                        progress.update(download_task, advance=1)
+                    except (RequestError, HTTPStatusError) as e:
+                        print(f"Request error: {e}")
+                        print(f"URL: {e.request.url}")
+                        request_url = str(e.request.url)
+                        retried_requests[request_url] += 1
+
+                        if retried_requests[request_url] > 3:
+                            print("Too many retries. Exiting.")
+                            sys.exit(-1)
+                        else:
+                            print(
+                                f"Retrying request (attempt {retried_requests[request_url]}/3)"
+                            )
+                            urls.appendleft(request_url)
+
+                    if urls:
+                        make_request(client, urls, pending_requests)
+
+    return list(products.values())
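
With the patch applied, the new commands can be exercised in-process through Typer's test runner instead of the installed `download-feed` console script. This is a usage sketch, not part of the patch: the credentials, library ID, and output path are placeholders, and invoking the command performs real HTTP requests against the Axis 360 API.

```python
# Drive the new Typer app in-process; CliRunner is Typer's test harness.
# All argument values below are placeholders.
from typer.testing import CliRunner

from palace_tools.cli.download_feed import app

runner = CliRunner()
result = runner.invoke(
    app,
    # Same flags the `axis` subcommand defines: -u/-p/-l credentials,
    # --json to convert the XML feed to JSON, then the output file argument.
    ["axis", "-u", "USERNAME", "-p", "PASSWORD", "-l", "LIBRARY_ID", "--json", "axis.json"],
)
print(result.exit_code, result.output)
```

The `opds2` and `overdrive` subcommands can be invoked the same way with their respective options.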
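The download loop in `src/palace_tools/feeds/overdrive.py` gets its throughput by keeping a bounded window of requests in flight: it seeds up to `connections * 2` tasks, waits with `FIRST_COMPLETED`, starts one new request per finished task, and pushes failed URLs back onto the front of the deque until a URL has failed more than three times. Below is a minimal, self-contained sketch of that pattern in isolation. The `fetch_all` helper and its parameters are illustrative, not part of the patch; the 3-retry limit mirrors `overdrive.fetch`, while seeding is simplified to `connections` tasks.

```python
import asyncio
from collections import defaultdict, deque

import httpx


async def fetch_all(urls: list[str], connections: int = 20) -> list[httpx.Response]:
    """Fetch every URL with at most `connections` requests in flight,
    retrying each failing URL up to 3 times before giving up."""
    queue: deque[str] = deque(urls)
    retries: defaultdict[str, int] = defaultdict(int)
    results: list[httpx.Response] = []

    async with httpx.AsyncClient(
        limits=httpx.Limits(max_connections=connections)
    ) as client:
        # Seed the window; one replacement task is started per completion below.
        pending = {
            asyncio.create_task(client.get(queue.pop()))
            for _ in range(min(connections, len(queue)))
        }
        while pending:
            done, pending = await asyncio.wait(
                pending, return_when=asyncio.FIRST_COMPLETED
            )
            for task in done:
                try:
                    response = task.result()  # re-raises transport errors
                    response.raise_for_status()
                    results.append(response)
                except (httpx.RequestError, httpx.HTTPStatusError) as e:
                    url = str(e.request.url)
                    retries[url] += 1
                    if retries[url] > 3:
                        raise
                    queue.appendleft(url)  # retried URLs jump ahead of fresh work
                if queue:
                    pending.add(asyncio.create_task(client.get(queue.pop())))
    return results
```

Refilling one task per completion keeps the connection pool saturated without a semaphore, and `appendleft` gives retried URLs priority over not-yet-attempted ones, the same design choice `overdrive.fetch` makes.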