Retrieve list of page IDs and root of the tree from API, and introduce caching
benoit74 committed Sep 30, 2024
1 parent 49e1090 commit 9f94f09
Showing 5 changed files with 230 additions and 19 deletions.
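The heart of the change is a read-through disk cache wrapped around every HTTP and API fetch, keyed by the URL subpath. A minimal sketch of the pattern (hypothetical helper name and timeout; the real implementation lives in client.py below):

from pathlib import Path

import requests

def cached_get_text(cache_folder: Path, base_url: str, subpath: str) -> str:
    """Sketch of the commit's read-through cache; names are illustrative."""
    key = f"text{subpath}".lstrip("/")
    if key.endswith("/"):
        key += "index"  # a trailing slash cannot name a cache file
    cache_file = cache_folder / key
    if cache_file.exists():
        return cache_file.read_text()  # cache hit: no network round-trip
    cache_file.parent.mkdir(parents=True, exist_ok=True)
    resp = requests.get(f"{base_url}{subpath}", timeout=10)
    resp.raise_for_status()
    cache_file.write_text(resp.text)  # cache miss: persist for next run
    return resp.text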
140 changes: 134 additions & 6 deletions scraper/src/libretexts2zim/client.py
@@ -1,6 +1,9 @@
import datetime
import json
import re
from collections.abc import Callable
from pathlib import Path
from typing import Any

import requests
from bs4 import BeautifulSoup, NavigableString
@@ -57,49 +60,154 @@ def placeholders(
class LibreTextsClient:
"""Utility functions to read data from libretexts."""

def __init__(self, library_slug: str) -> None:
def __init__(self, library_slug: str, cache_folder: Path) -> None:
"""Initializes LibreTextsClient.
Parameters:
library_slug: Slug of the LibreTexts library,
e.g. `geo` for `https://geo.libretexts.org/`.
cache_folder: Folder where HTTP responses are cached.
"""
self.library_slug = library_slug
self.deki_token = None
self.cache_folder = cache_folder

@property
def library_url(self) -> str:
return f"https://{self.library_slug}.libretexts.org/"
return f"https://{self.library_slug}.libretexts.org"

def _get_text(self, url: str) -> str:
@property
def api_url(self) -> str:
return f"{self.library_url}/@api/deki"

def _get_cache_file(self, url_subpath_and_query: str) -> Path:
"""Get location where HTTP result should be cached"""
if url_subpath_and_query.startswith("/"):
url_subpath_and_query = url_subpath_and_query[1:]
if url_subpath_and_query.endswith("/"):
url_subpath_and_query += "index"
return self.cache_folder / url_subpath_and_query
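For illustration, the mapping above resolves like this (a sketch, assuming cache_folder is Path("cache"); callers prefix the subpath with text, api_json, or api_content, as seen below):

# Hypothetical worked examples of _get_cache_file:
#   _get_cache_file("text/")                     -> cache/text/index
#   _get_cache_file("api_json/pages/home/tree")  -> cache/api_json/pages/home/tree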

def _get_text(self, url_subpath_and_query: str) -> str:
"""Perform a GET request and return the response as decoded text."""

logger.debug(f"Fetching {url}")
cache_file = self._get_cache_file(f"text{url_subpath_and_query}")
if cache_file.exists():
return cache_file.read_text()
cache_file.parent.mkdir(parents=True, exist_ok=True)

full_url = f"{self.library_url}{url_subpath_and_query}"
logger.debug(f"Fetching {full_url}")

resp = requests.get(
url=url,
url=full_url,
allow_redirects=True,
timeout=HTTP_TIMEOUT_SECONDS,
)
resp.raise_for_status()

cache_file.write_text(resp.text)
return resp.text

def _get_api_resp(
self, api_sub_path_and_query: str, timeout: float
) -> requests.Response:
api_url = f"{self.api_url}{api_sub_path_and_query}"
logger.debug(f"Calling API at {api_url}")
resp = requests.get(
url=api_url,
headers={"x-deki-token": self.deki_token},
timeout=timeout,
)
resp.raise_for_status()
return resp

def _get_api_json(
self, api_sub_path: str, timeout: float = HTTP_TIMEOUT_SECONDS
) -> Any:
cache_file = self._get_cache_file(f"api_json{api_sub_path}")
if cache_file.exists():
return json.loads(cache_file.read_text())
cache_file.parent.mkdir(parents=True, exist_ok=True)
resp = self._get_api_resp(
f"{api_sub_path}?dream.out.format=json", timeout=timeout
)
result = resp.json()
cache_file.write_text(json.dumps(result))
return result

def _get_api_content(
self, api_sub_path: str, timeout: float = HTTP_TIMEOUT_SECONDS
) -> bytes:
cache_file = self._get_cache_file(f"api_content{api_sub_path}")
if cache_file.exists():
# cached value was written with write_bytes below, so read it back as bytes
return cache_file.read_bytes()
cache_file.parent.mkdir(parents=True, exist_ok=True)
resp = self._get_api_resp(api_sub_path, timeout=timeout)
result = resp.content
cache_file.write_bytes(result)
return result

def get_home(self) -> LibreTextsHome:
home_content = self._get_text(self.library_url)
"""Retrieves data about home page by crawling home page"""
home_content = self._get_text("/")

soup = _get_soup(home_content)
self.deki_token = _get_deki_token_from_home(soup)
return LibreTextsHome(
welcome_text_paragraphs=_get_welcome_text_from_home(soup),
welcome_image_url=_get_welcome_image_url_from_home(soup),
shelves=[],
)

def get_deki_token(self) -> str:
"""Retrieves the API token to use to query the website API"""
if self.deki_token:
return self.deki_token

home_content = self._get_text("/")

soup = _get_soup(home_content)
self.deki_token = _get_deki_token_from_home(soup)
return self.deki_token

def get_all_pages_ids(self) -> list[str]:
"""Returns the IDs of all pages on the current website, exploring the whole tree"""

tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_SECONDS * 2)

page_ids: list[str] = []

def _get_page_ids(page_node: Any) -> None:
page_ids.append(page_node["@id"])
if not page_node["subpages"]:
return
if "@id" in page_node["subpages"]["page"]:
_get_page_ids(page_node["subpages"]["page"])
else:
for page in page_node["subpages"]["page"]:
_get_page_ids(page)

_get_page_ids(tree["page"])

return page_ids
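The dict-or-list branch above is needed because the Deki API serializes a single subpage as a JSON object but several subpages as an array. A minimal sketch of the two shapes the walker must handle (IDs and nesting are hypothetical):

# Hypothetical /pages/home/tree excerpts, trimmed to the fields the walker reads:
single_child = {"@id": "1", "subpages": {"page": {"@id": "2", "subpages": ""}}}
several_children = {
    "@id": "1",
    "subpages": {
        "page": [
            {"@id": "2", "subpages": ""},
            {"@id": "3", "subpages": ""},
        ]
    },
}

Walking several_children with _get_page_ids would collect ["1", "2", "3"].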

def get_root_page_id(self) -> str:
"""Returns the ID the root of the tree of pages"""

tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_SECONDS * 2)
return tree["page"]["@id"]


def _get_soup(content: str) -> BeautifulSoup:
"""Return a BeautifulSoup soup from textual content
This is a utility function to ensure the same parser is used across the whole codebase
"""
return BeautifulSoup(content, "html.parser")


def _get_welcome_image_url_from_home(soup: BeautifulSoup) -> str:
"""Return the URL of the image found on home header"""
branding_div = soup.find("div", class_="LTBranding")
if not branding_div:
raise LibreTextsParsingError("<div> with class 'LTBranding' not found")
@@ -119,6 +227,7 @@ def _get_welcome_image_url_from_home(soup: BeautifulSoup) -> str:


def _get_welcome_text_from_home(soup: BeautifulSoup) -> list[str]:
"""Returns the text found on home page"""
content_section = soup.find("section", class_="mt-content-container")
if not content_section or isinstance(content_section, NavigableString):
raise LibreTextsParsingError(
@@ -133,3 +242,22 @@ def _get_welcome_text_from_home(soup: BeautifulSoup) -> list[str]:
if paragraph_text := paragraph.text:
welcome_text.append(paragraph_text)
return welcome_text


def _get_deki_token_from_home(soup: BeautifulSoup) -> str:
global_settings = soup.find("script", id="mt-global-settings")
if not global_settings:
logger.debug("home content:")
logger.debug(soup)
raise Exception(
"Failed to retrieve API token to query website API, missing "
"mt-global-settings script"
)
x_deki_token = json.loads(global_settings.text).get("apiToken", None)
if not x_deki_token:
logger.debug("mt-global-settings script content:")
logger.debug(global_settings.text)
raise Exception(
"Failed to retrieve API token to query website API, missing apiToken."
)
return x_deki_token
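Taken together, the new client can be exercised like this (a usage sketch, not part of the commit; the slug and folder names are arbitrary):

from pathlib import Path

from libretexts2zim.client import LibreTextsClient

client = LibreTextsClient(library_slug="geo", cache_folder=Path("cache"))
client.get_home()  # crawls "/" and stores the deki token for API calls
root_id = client.get_root_page_id()  # GET /@api/deki/pages/home/tree
page_ids = client.get_all_pages_ids()  # served from cache on a second run

Because get_root_page_id and get_all_pages_ids hit the same /pages/home/tree endpoint, the second call is answered from the cache even within a single run.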
36 changes: 33 additions & 3 deletions scraper/src/libretexts2zim/entrypoint.py
@@ -1,6 +1,9 @@
import argparse
import logging
import os
from pathlib import Path

from zimscraperlib.zim.filesystem import validate_zimfile_creatable

from libretexts2zim.client import LibreTextsClient
from libretexts2zim.constants import (
@@ -46,11 +49,18 @@ def main() -> None:

parser.add_argument(
"--output",
help="Output folder for ZIMs. Default: /output",
default="/output",
help="Output folder for ZIMs. Default: output",
default="output",
dest="output_folder",
)

parser.add_argument(
"--tmp",
help="Temporary folder for cache, intermediate files, ... Default: tmp",
default="tmp",
dest="tmp_folder",
)

parser.add_argument(
"--zimui-dist",
type=str,
@@ -84,21 +94,41 @@ def main() -> None:
required=True,
)

parser.add_argument(
"--keep-cache",
help="Keep cache of website responses",
action="store_true",
default=False,
)

args = parser.parse_args()

logger.setLevel(level=logging.DEBUG if args.debug else logging.INFO)

output_folder = Path(args.output_folder)
output_folder.mkdir(exist_ok=True)
validate_zimfile_creatable(output_folder, "test.txt")

tmp_folder = Path(args.tmp_folder)
tmp_folder.mkdir(exist_ok=True)
validate_zimfile_creatable(tmp_folder, "test.txt")

try:
zim_config = ZimConfig.of(args)
doc_filter = ContentFilter.of(args)

cache_folder = tmp_folder / "cache"
cache_folder.mkdir(exist_ok=True)

libretexts_client = LibreTextsClient(
library_slug=args.library_slug,
cache_folder=cache_folder,
)

Generator(
libretexts_client=libretexts_client,
zim_config=zim_config,
output_folder=args.output_folder,
output_folder=output_folder,
zimui_dist=args.zimui_dist,
content_filter=doc_filter,
overwrite_existing_zim=args.overwrite,
20 changes: 12 additions & 8 deletions scraper/src/libretexts2zim/generator.py
@@ -1,6 +1,5 @@
import argparse
import datetime
import os
import re
from io import BytesIO
from pathlib import Path
@@ -11,6 +10,7 @@
)
from zimscraperlib.image import resize_image
from zimscraperlib.zim import Creator
from zimscraperlib.zim.filesystem import validate_zimfile_creatable
from zimscraperlib.zim.indexing import IndexData

from libretexts2zim.client import LibreTextsClient, LibreTextsMetadata
@@ -107,7 +107,7 @@ def __init__(
libretexts_client: LibreTextsClient,
zim_config: ZimConfig,
content_filter: ContentFilter,
output_folder: str,
output_folder: Path,
zimui_dist: str,
*,
overwrite_existing_zim: bool,
@@ -129,8 +129,6 @@ def __init__(
self.zimui_dist = Path(zimui_dist)
self.overwrite_existing_zim = overwrite_existing_zim

os.makedirs(self.output_folder, exist_ok=True)

self.zim_illustration_path = self.libretexts_newsite_path(
"header_logo_mini.png"
)
@@ -157,11 +155,17 @@ def run(self) -> Path:
name=self.zim_config.library_name, slug=self.libretexts_client.library_slug
)
formatted_config = self.zim_config.format(metadata.placeholders())
zim_path = Path(self.output_folder, f"{formatted_config.file_name_format}.zim")
zim_file_name = f"{formatted_config.file_name_format}.zim"
zim_path = self.output_folder / zim_file_name

if zim_path.exists():
if self.overwrite_existing_zim:
zim_path.unlink()
else:
logger.error(f" {zim_path} already exists, aborting.")
raise SystemExit(f"ZIM file already exists at {zim_path}")

if zim_path.exists() and not self.overwrite_existing_zim:
logger.error(f" {zim_path} already exists, aborting.")
raise SystemExit(f"ZIM file already exists at {zim_path}")
validate_zimfile_creatable(self.output_folder, zim_file_name)

logger.info(f" Writing to: {zim_path}")

11 changes: 11 additions & 0 deletions scraper/tests-integration/conftest.py
@@ -1,3 +1,8 @@
import tempfile
from collections.abc import Generator
from pathlib import Path
from typing import Any

import pytest


@@ -6,6 +11,12 @@ def libretexts_slug() -> str:
return "geo"


@pytest.fixture(scope="module")
def cache_folder() -> Generator[Path, Any, Any]:
with tempfile.TemporaryDirectory() as tmpdir:
yield Path(tmpdir)


@pytest.fixture(scope="module")
def libretexts_url(libretexts_slug: str) -> str:
return f"https://{libretexts_slug}.libretexts.org"