From 51b95d92991eb10a14df47a7ae31990a2bc8c5dd Mon Sep 17 00:00:00 2001
From: Jonathan Green
Date: Fri, 8 Mar 2024 17:05:17 -0400
Subject: [PATCH] Add feed download command (#21)

* Add feed download commands

* Update README
---
 README.md                             |   7 +
 poetry.lock                           |  24 ++-
 pyproject.toml                        |   3 +
 src/palace_tools/cli/download_feed.py | 110 +++++++++++++
 src/palace_tools/feeds/__init__.py    |   0
 src/palace_tools/feeds/axis.py        |  43 +++++
 src/palace_tools/feeds/opds.py        |  64 ++++++++
 src/palace_tools/feeds/overdrive.py   | 220 ++++++++++++++++++++++++++
 8 files changed, 470 insertions(+), 1 deletion(-)
 create mode 100644 src/palace_tools/cli/download_feed.py
 create mode 100644 src/palace_tools/feeds/__init__.py
 create mode 100644 src/palace_tools/feeds/axis.py
 create mode 100644 src/palace_tools/feeds/opds.py
 create mode 100644 src/palace_tools/feeds/overdrive.py

diff --git a/README.md b/README.md
index 85f69d9..e575667 100644
--- a/README.md
+++ b/README.md
@@ -25,6 +25,13 @@ manifest conforming to the [Audiobook Profile](https://github.com/readium/webpub
   a local directory containing audiobook manifests and their associated media files.
   - Note: This application uses `python-vlc` which requires VLC to be installed on the system. The VLC
     installation can be found [here](https://www.videolan.org/vlc/).
+- `download-feed` - Download various feeds for local inspection.
+  - `opds2`
+    - Download an OPDS2 / OPDS2 + ODL feed.
+  - `overdrive`
+    - Download Overdrive feeds.
+  - `axis`
+    - Download B&T Axis 360 availability feed.
 
 ### Library Support
 
diff --git a/poetry.lock b/poetry.lock
index 26b7f29..ede04d7 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -708,6 +708,17 @@ files = [
     {file = "types_pytz-2024.1.0.20240203-py3-none-any.whl", hash = "sha256:9679eef0365db3af91ef7722c199dbb75ee5c1b67e3c4dd7bfbeb1b8a71c21a3"},
 ]
 
+[[package]]
+name = "types-xmltodict"
+version = "0.13.0.3"
+description = "Typing stubs for xmltodict"
+optional = false
+python-versions = "*"
+files = [
+    {file = "types-xmltodict-0.13.0.3.tar.gz", hash = "sha256:8884534bab0364c4b22d5973f3c8153ff40d413a801d9e70eb893e676909f1fc"},
+    {file = "types_xmltodict-0.13.0.3-py3-none-any.whl", hash = "sha256:cb251c59e838986d8402b10d804225ade9fd6c9f66d01dc45cd6cfdf43640128"},
+]
+
 [[package]]
 name = "typing-extensions"
 version = "4.9.0"
@@ -753,7 +764,18 @@ platformdirs = ">=3.9.1,<5"
 docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"]
 test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"]
 
+[[package]]
+name = "xmltodict"
+version = "0.13.0"
+description = "Makes working with XML feel like you are working with JSON"
+optional = false
+python-versions = ">=3.4"
+files = [
+    {file = "xmltodict-0.13.0-py2.py3-none-any.whl", hash = "sha256:aa89e8fd76320154a40d19a0df04a4695fb9dc5ba977cbb68ab3e4eb225e7852"},
+    {file = "xmltodict-0.13.0.tar.gz", hash = "sha256:341595a488e3e01a85a9d8911d8912fd922ede5fecc4dce437eb4b6c8d037e56"},
+]
+
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.10,<4"
-content-hash = "67e54f28f347ed760ac2fb696cbbf44590916415290e0e4c80cd504ebd526733"
+content-hash = "9efac03c52aff2490dfaf3e3f2d8d9f638981e1b5889faac3eddbcc3d83078fc"
diff --git a/pyproject.toml b/pyproject.toml
index 0449e70..8d4600c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,6 +37,7 @@ rich = "^13.7.1"
 textual = "^0.52.1"
 typer = "^0.9.0"
 typing_extensions = {version = "^4.9.0", python = "<3.11"}
+xmltodict = "^0.13.0"
 
 [tool.poetry.group.ci.dependencies]
 pre-commit = "^3.6.1"
@@ -44,9 +45,11 @@ pre-commit = "^3.6.1"
 
 [tool.poetry.group.dev.dependencies]
 mypy = "^1.8.0"
 types-pytz = "^2024.1.0.20240203"
+types-xmltodict = "^0.13.0.3"
 
 [tool.poetry.scripts]
 audiobook-manifest-summary = "palace_tools.cli.summarize_rwpm_audio_manifest:main"
+download-feed = "palace_tools.cli.download_feed:main"
 fetch-lcp = "palace_tools.cli.fetch_lcp:main"
 palace-terminal = "palace_tools.cli.palace_terminal:main"
 patron-bookshelf = "palace_tools.cli.patron_bookshelf:main"
diff --git a/src/palace_tools/cli/download_feed.py b/src/palace_tools/cli/download_feed.py
new file mode 100644
index 0000000..4cdc185
--- /dev/null
+++ b/src/palace_tools/cli/download_feed.py
@@ -0,0 +1,110 @@
+import asyncio
+import json
+from pathlib import Path
+from xml.dom import minidom
+
+import typer
+import xmltodict
+
+from palace_tools.feeds import axis, opds, overdrive
+from palace_tools.feeds.opds import write_json
+from palace_tools.utils.typer import run_typer_app_as_main
+
+app = typer.Typer()
+
+
+@app.command("axis")
+def download_axis(
+    username: str = typer.Option(..., "--username", "-u", help="Username"),
+    password: str = typer.Option(..., "--password", "-p", help="Password"),
+    library_id: str = typer.Option(..., "-l", "--library-id", help="Library ID"),
+    output_json: bool = typer.Option(False, "-j", "--json", help="Output JSON file"),
+    qa_endpoint: bool = typer.Option(False, "-q", "--qa", help="Use QA Endpoint"),
+    output_file: Path = typer.Argument(
+        ..., help="Output file", writable=True, file_okay=True, dir_okay=False
+    ),
+) -> None:
+    """Download B&T Axis 360 feed."""
+
+    # Find the base URL to use
+    base_url = axis.PRODUCTION_BASE_URL if not qa_endpoint else axis.QA_BASE_URL
+
+    # Fetch the document as XML
+    xml = axis.availability(base_url, username, password, library_id)
+
+    with output_file.open("w") as file:
+        if output_json:
+            xml_dict = xmltodict.parse(xml)
+            file.write(json.dumps(xml_dict, indent=4))
+        else:
+            parsed = minidom.parseString(xml)
+            file.write(parsed.toprettyxml())
+
+
+@app.command("overdrive")
+def download_overdrive(
+    client_key: str = typer.Option(..., "-k", "--client-key", help="Client Key"),
+    client_secret: str = typer.Option(
+        ..., "-s", "--client-secret", help="Client Secret"
+    ),
+    library_id: str = typer.Option(..., "-l", "--library-id", help="Library ID"),
+    parent_library_id: str = typer.Option(
+        None,
+        "-p",
+        "--parent-library-id",
+        help="Parent Library ID (for Advantage Accounts)",
+    ),
+    fetch_metadata: bool = typer.Option(
+        False, "-m", "--metadata", help="Fetch metadata"
+    ),
+    fetch_availability: bool = typer.Option(
+        False, "-a", "--availability", help="Fetch availability"
+    ),
+    qa_endpoint: bool = typer.Option(False, "-q", "--qa", help="Use QA Endpoint"),
+    connections: int = typer.Option(
+        20, "-c", "--connections", help="Number of connections to use"
+    ),
+    output_file: Path = typer.Argument(
+        ..., help="Output file", writable=True, file_okay=True, dir_okay=False
+    ),
+) -> None:
+    """Download Overdrive feed."""
+    base_url = overdrive.QA_BASE_URL if qa_endpoint else overdrive.PROD_BASE_URL
+    products = asyncio.run(
+        overdrive.fetch(
+            base_url,
+            client_key,
+            client_secret,
+            library_id,
+            parent_library_id,
+            fetch_metadata,
+            fetch_availability,
+            connections,
+        )
+    )
+
+    with output_file.open("w") as file:
+        file.write(json.dumps(products, indent=4))
+
+
+@app.command("opds2")
+def download_opds(
+    username: str = typer.Option(None, "--username", "-u", help="Username"),
+    password: str = typer.Option(None, "--password", "-p", help="Password"),
+    url: str = typer.Argument(..., help="URL of feed", metavar="URL"),
+    output_file: Path = typer.Argument(
+        ..., help="Output file", writable=True, file_okay=True, dir_okay=False
+    ),
+) -> None:
+    """Download OPDS 2 feed."""
+    publications = opds.fetch(url, username, password)
+    with output_file.open("w") as file:
+        write_json(file, publications)
+
+
+def main() -> None:
+    run_typer_app_as_main(app, prog_name="download-feed")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/palace_tools/feeds/__init__.py b/src/palace_tools/feeds/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/palace_tools/feeds/axis.py b/src/palace_tools/feeds/axis.py
new file mode 100644
index 0000000..e967941
--- /dev/null
+++ b/src/palace_tools/feeds/axis.py
@@ -0,0 +1,43 @@
+import base64
+import json
+import sys
+
+import httpx
+
+PRODUCTION_BASE_URL = "https://axis360api.baker-taylor.com/Services/VendorAPI/"
+QA_BASE_URL = "https://axis360apiqa.baker-taylor.com/Services/VendorAPI/"
+
+access_token_endpoint = "accesstoken"
+availability_endpoint = "availability/v2"
+
+
+def get_headers(
+    base_url: str, username: str, password: str, library_id: str
+) -> dict[str, str]:
+    authorization_str = ":".join([username, password, library_id])
+    authorization_bytes = authorization_str.encode("utf_16_le")
+    authorization_b64 = base64.standard_b64encode(authorization_bytes)
+    resp = httpx.post(
+        base_url + access_token_endpoint,
+        headers={"Authorization": f"Basic {authorization_b64.decode('utf-8')}"},
+    )
+    if resp.status_code != 200:
+        print(f"Error: {resp.status_code}")
+        print(f"Headers: {json.dumps(dict(resp.headers), indent=4)}")
+        print(resp.text)
+        sys.exit(-1)
+    return {
+        "Authorization": "Bearer " + resp.json()["access_token"],
+        "Library": library_id,
+    }
+
+
+def availability(base_url: str, username: str, password: str, library_id: str) -> str:
+    headers = get_headers(base_url, username, password, library_id)
+    resp = httpx.get(
+        base_url + availability_endpoint,
+        headers=headers,
+        params={"updatedDate": "1970-01-01 00:00:00"},
+        timeout=30.0,
+    )
+    return resp.text
diff --git a/src/palace_tools/feeds/opds.py b/src/palace_tools/feeds/opds.py
new file mode 100644
index 0000000..7d973f6
--- /dev/null
+++ b/src/palace_tools/feeds/opds.py
@@ -0,0 +1,64 @@
+from __future__ import annotations
+
+import json
+import math
+import sys
+from typing import Any, TextIO
+
+import httpx
+from rich.progress import MofNCompleteColumn, Progress, SpinnerColumn
+
+
+def make_request(session: httpx.Client, url: str) -> dict[str, Any]:
+    response = session.get(url)
+    if response.status_code != 200:
+        print(f"Error: {response.status_code}")
+        print(f"Headers: {json.dumps(dict(response.headers), indent=4)}")
+        print(response.text)
+        sys.exit(-1)
+    return response.json()  # type: ignore[no-any-return]
+
+
+def write_json(file: TextIO, data: list[dict[str, Any]]) -> None:
+    file.write(json.dumps(data, indent=4))
+
+
+def fetch(
+    url: str, username: str | None, password: str | None
+) -> list[dict[str, Any]]:
+    # Create a session to fetch the documents
+    client = httpx.Client()
+    client.headers.update({"Accept": "application/opds+json", "User-Agent": "Palace"})
+    client.timeout = httpx.Timeout(30.0)
+
+    if username and password:
+        client.auth = httpx.BasicAuth(username, password)
+
+    publications = []
+
+    # Get the first page
+    response = make_request(client, url)
+    items = response.get("metadata", {}).get("numberOfItems")
+    items_per_page = response.get("metadata", {}).get("itemsPerPage")
+
+    if items is None or items_per_page is None:
+        pages = None
+    else:
+        pages = math.ceil(items / items_per_page)
+
+    # Fetch the rest of the pages
+    next_url: str | None = url
+    with Progress(
+        SpinnerColumn(), *Progress.get_default_columns(), MofNCompleteColumn()
+    ) as progress:
+        download_task = progress.add_task("Downloading Feed", total=pages)
+        while next_url is not None:
+            response = make_request(client, next_url)
+            publications.extend(response["publications"])
+            next_url = None
+            for link in response["links"]:
+                if link["rel"] == "next":
+                    next_url = link["href"]
+                    break
+            progress.update(download_task, advance=1)
+
+    return publications
diff --git a/src/palace_tools/feeds/overdrive.py b/src/palace_tools/feeds/overdrive.py
new file mode 100644
index 0000000..a33070c
--- /dev/null
+++ b/src/palace_tools/feeds/overdrive.py
@@ -0,0 +1,220 @@
+from __future__ import annotations
+
+import asyncio
+import json
+import math
+import sys
+from collections import defaultdict, deque
+from typing import Any
+
+import httpx
+from httpx import URL, HTTPStatusError, Limits, RequestError, Response, Timeout
+from rich.progress import MofNCompleteColumn, Progress, SpinnerColumn
+
+QA_BASE_URL = "https://integration.api.overdrive.com"
+PROD_BASE_URL = "https://api.overdrive.com"
+
+TOKEN_ENDPOINT = "https://oauth.overdrive.com/token"
+EVENTS_ENDPOINT = "/v1/collections/%(collection_token)s/products"
+LIBRARY_ENDPOINT = "/v1/libraries/%(library_id)s"
+ADVANTAGE_LIBRARY_ENDPOINT = (
+    "/v1/libraries/%(parent_library_id)s/advantageAccounts/%(library_id)s"
+)
+
+
+def handle_error(resp: Response) -> None:
+    if resp.status_code == 200:
+        return
+    print(f"URL: {resp.url}")
+    print(f"Error: {resp.status_code}")
+    print(f"Headers: {json.dumps(dict(resp.headers), indent=4)}")
+    print(resp.text)
+    sys.exit(-1)
+
+
+async def get_auth_token(
+    http: httpx.AsyncClient, client_key: str, client_secret: str
+) -> str:
+    auth = (client_key, client_secret)
+    resp = await http.post(
+        TOKEN_ENDPOINT, auth=auth, data=dict(grant_type="client_credentials")
+    )
+    handle_error(resp)
+    return resp.json()["access_token"]  # type: ignore[no-any-return]
+
+
+def get_headers(auth_token: str) -> dict[str, str]:
+    return {"Authorization": "Bearer " + auth_token, "User-Agent": "Palace"}
+
+
+async def get_collection_token(
+    http: httpx.AsyncClient, library_id: str, parent_library_id: str | None
+) -> str:
+    variables = {
+        "parent_library_id": parent_library_id,
+        "library_id": library_id,
+    }
+
+    endpoint = ADVANTAGE_LIBRARY_ENDPOINT if parent_library_id else LIBRARY_ENDPOINT
+
+    resp = await http.get(endpoint % variables)
+    handle_error(resp)
+    return resp.json()["collectionToken"]  # type: ignore[no-any-return]
+
+
+def event_url(
+    collection_token: str,
+    sort: str = "popularity:desc",
+    limit: int = 200,
+    offset: int | None = None,
+) -> str:
+    url = EVENTS_ENDPOINT % {"collection_token": collection_token}
+    params = {"sort": sort, "limit": limit}
+    if offset is not None:
+        params["offset"] = offset
+
+    return url + "?" + "&".join(f"{k}={v}" for k, v in params.items())
+
+
+def make_request(
+    client: httpx.AsyncClient,
+    urls: deque[str] | str,
+    pending_requests: list[asyncio.Task[Response]],
+) -> None:
+    if isinstance(urls, str):
+        url = urls
+    else:
+        url = urls.pop()
+    req = client.get(url)
+    task = asyncio.create_task(req)
+    pending_requests.append(task)
+
+
+def process_request(
+    response: Response,
+    request_metadata: bool,
+    request_availability: bool,
+    base_url: str,
+    events_path: str,
+    products: dict[str, Any],
+    urls: deque[str],
+) -> None:
+    data = response.raise_for_status().json()
+    path = response.url.path
+    if path == events_path:
+        response_products = data["products"]
+        for product in response_products:
+            if request_metadata:
+                urls.append(product["links"]["metadata"]["href"].removeprefix(base_url))
+            if request_availability:
+                urls.append(
+                    product["links"]["availability"]["href"].removeprefix(base_url)
+                )
+                urls.append(
+                    product["links"]["availabilityV2"]["href"].removeprefix(base_url)
+                )
+            products[product["id"].lower()] = product
+    elif path.endswith("availability") and path.startswith("/v1/"):
+        products[data["id"].lower()]["availability"] = data
+    elif path.endswith("availability") and path.startswith("/v2/"):
+        products[data["reserveId"].lower()]["availabilityV2"] = data
+    elif path.endswith("metadata") and path.startswith("/v1/"):
+        products[data["id"].lower()]["metadata"] = data
+    else:
+        raise RuntimeError(f"Unknown URL: {response.url}")
+
+
+async def fetch(
+    base_url: str,
+    client_key: str,
+    client_secret: str,
+    library_id: str,
+    parent_library_id: str | None,
+    fetch_metadata: bool,
+    fetch_availability: bool,
+    connections: int,
+) -> list[dict[str, Any]]:
+    async with httpx.AsyncClient(
+        timeout=Timeout(20.0, pool=None),
+        limits=Limits(
+            max_connections=connections,
+            max_keepalive_connections=connections,
+            keepalive_expiry=5,
+        ),
+    ) as client:
+        auth_token = await get_auth_token(client, client_key, client_secret)
+
+        client.headers.update(get_headers(auth_token))
+        client.base_url = URL(base_url)
+
+        collection_token = await get_collection_token(
+            client, library_id, parent_library_id
+        )
+
+        first_page = await client.get(event_url(collection_token))
+        handle_error(first_page)
+        first_page_data = first_page.json()
+
+        items = first_page_data["totalItems"]
+        items_per_page = first_page_data["limit"]
+        pages = math.ceil(items / items_per_page)
+
+        fetches = (
+            pages
+            + (items if fetch_metadata else 0)
+            + (items * 2 if fetch_availability else 0)
+        )
+        with Progress(
+            SpinnerColumn(), *Progress.get_default_columns(), MofNCompleteColumn()
+        ) as progress:
+            download_task = progress.add_task("Downloading Feed", total=fetches)
+            urls: deque[str] = deque()
+            pending_requests: list[asyncio.Task[Response]] = []
+            products: dict[str, Any] = {}
+            retried_requests: defaultdict[str, int] = defaultdict(int)
+
+            for i in range(pages):
+                urls.append(event_url(collection_token, offset=i * items_per_page))
+
+            for i in range(min(connections * 2, len(urls))):
+                make_request(client, urls, pending_requests)
+
+            while pending_requests:
+                done, pending = await asyncio.wait(
+                    pending_requests, return_when=asyncio.FIRST_COMPLETED
+                )
+
+                pending_requests = list(pending)
+                events_path = EVENTS_ENDPOINT % {"collection_token": collection_token}
+
+                for req in done:
+                    try:
+                        response = await req
+                        process_request(
+                            response,
+                            fetch_metadata,
+                            fetch_availability,
+                            base_url,
+                            events_path,
+                            products,
+                            urls,
+                        )
+                        progress.update(download_task, advance=1)
+                    except (RequestError, HTTPStatusError) as e:
+                        print(f"Request error: {e}")
+                        print(f"URL: {e.request.url}")
+                        request_url = str(e.request.url)
+                        retried_requests[request_url] += 1
+
+                        if retried_requests[request_url] > 3:
+                            print("Too many retries. Exiting.")
+                            sys.exit(-1)
+                        else:
+                            print(
+                                f"Retrying request (attempt {retried_requests[request_url]}/3)"
+                            )
+                            urls.appendleft(request_url)
+
+                    if urls:
+                        make_request(client, urls, pending_requests)
+
+    return list(products.values())
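
With the patch applied, the new commands can be exercised in-process through Typer's test runner instead of the installed `download-feed` console script. This is a usage sketch, not part of the patch: the credentials, library ID, and output path are placeholders, and invoking the command performs real HTTP requests against the Axis 360 API.

```python
# Drive the new Typer app in-process; CliRunner is Typer's test harness.
# All argument values below are placeholders.
from typer.testing import CliRunner

from palace_tools.cli.download_feed import app

runner = CliRunner()
result = runner.invoke(
    app,
    # Same flags the `axis` subcommand defines: -u/-p/-l credentials,
    # --json to convert the XML feed to JSON, then the output file argument.
    ["axis", "-u", "USERNAME", "-p", "PASSWORD", "-l", "LIBRARY_ID", "--json", "axis.json"],
)
print(result.exit_code, result.output)
```

The `opds2` and `overdrive` subcommands can be invoked the same way with their respective options.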
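The download loop in `src/palace_tools/feeds/overdrive.py` gets its throughput by keeping a bounded window of requests in flight: it seeds up to `connections * 2` tasks, waits with `FIRST_COMPLETED`, starts one new request per finished task, and pushes failed URLs back onto the front of the deque until a URL has failed more than three times. Below is a minimal, self-contained sketch of that pattern in isolation. The `fetch_all` helper and its parameters are illustrative, not part of the patch; the 3-retry limit mirrors `overdrive.fetch`, while seeding is simplified to `connections` tasks.

```python
import asyncio
from collections import defaultdict, deque

import httpx


async def fetch_all(urls: list[str], connections: int = 20) -> list[httpx.Response]:
    """Fetch every URL with at most `connections` requests in flight,
    retrying each failing URL up to 3 times before giving up."""
    queue: deque[str] = deque(urls)
    retries: defaultdict[str, int] = defaultdict(int)
    results: list[httpx.Response] = []

    async with httpx.AsyncClient(
        limits=httpx.Limits(max_connections=connections)
    ) as client:
        # Seed the window; one replacement task is started per completion below.
        pending = {
            asyncio.create_task(client.get(queue.pop()))
            for _ in range(min(connections, len(queue)))
        }
        while pending:
            done, pending = await asyncio.wait(
                pending, return_when=asyncio.FIRST_COMPLETED
            )
            for task in done:
                try:
                    response = task.result()  # re-raises transport errors
                    response.raise_for_status()
                    results.append(response)
                except (httpx.RequestError, httpx.HTTPStatusError) as e:
                    url = str(e.request.url)
                    retries[url] += 1
                    if retries[url] > 3:
                        raise
                    queue.appendleft(url)  # retried URLs jump ahead of fresh work
                if queue:
                    pending.add(asyncio.create_task(client.get(queue.pop())))
    return results
```

Refilling one task per completion keeps the connection pool saturated without a semaphore, and `appendleft` gives retried URLs priority over not-yet-attempted ones, the same design choice `overdrive.fetch` makes.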