Skip to content

Commit

Permalink
fixup! Retrieve list of page IDs and root of the tree from API, and introduce caching
Browse files Browse the repository at this point in the history
  • Loading branch information
benoit74 committed Oct 3, 2024
1 parent aca4c7e commit 4b65edf
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 14 deletions.
4 changes: 3 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ RUN pip install --no-cache-dir /src/scraper \
# Copy zimui build output
COPY --from=zimui /src/dist /src/zimui

ENV LIBRETEXTS_ZIMUI_DIST=/src/zimui LIBRETEXTS_OUTPUT=/output LIBRETEXTS_TMP=/tmp
ENV LIBRETEXTS_ZIMUI_DIST=/src/zimui \
LIBRETEXTS_OUTPUT=/output \
LIBRETEXTS_TMP=/tmp

CMD ["libretexts2zim", "--help"]
18 changes: 9 additions & 9 deletions scraper/src/libretexts2zim/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@

from libretexts2zim.constants import logger

HTTP_TIMEOUT_SECONDS = 15
HTTP_TIMEOUT_NORMAL_SECONDS = 15
HTTP_TIMEOUT_LONG_SECONDS = 30


class LibreTextsParsingError(Exception):
Expand Down Expand Up @@ -74,8 +75,7 @@ def api_url(self) -> str:

def _get_cache_file(self, url_subpath_and_query: str) -> Path:
"""Get location where HTTP result should be cached"""
if url_subpath_and_query.startswith("/"):
url_subpath_and_query = url_subpath_and_query[1:]
url_subpath_and_query = re.sub(r"^/", "", url_subpath_and_query)

Check warning on line 78 in scraper/src/libretexts2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/client.py#L78

Added line #L78 was not covered by tests
if url_subpath_and_query.endswith("/"):
url_subpath_and_query += "index"
return self.cache_folder / url_subpath_and_query

Check warning on line 81 in scraper/src/libretexts2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/client.py#L80-L81

Added lines #L80 - L81 were not covered by tests
Expand All @@ -94,7 +94,7 @@ def _get_text(self, url_subpath_and_query: str) -> str:
resp = requests.get(
url=full_url,
allow_redirects=True,
timeout=HTTP_TIMEOUT_SECONDS,
timeout=HTTP_TIMEOUT_NORMAL_SECONDS,
)
resp.raise_for_status()

Expand All @@ -115,7 +115,7 @@ def _get_api_resp(
return resp

Check warning on line 115 in scraper/src/libretexts2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/client.py#L114-L115

Added lines #L114 - L115 were not covered by tests

def _get_api_json(
self, api_sub_path: str, timeout: float = HTTP_TIMEOUT_SECONDS
self, api_sub_path: str, timeout: float = HTTP_TIMEOUT_NORMAL_SECONDS
) -> Any:
cache_file = self._get_cache_file(f"api_json{api_sub_path}")

Check warning on line 120 in scraper/src/libretexts2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/client.py#L120

Added line #L120 was not covered by tests
if cache_file.exists():
Expand All @@ -129,11 +129,11 @@ def _get_api_json(
return result

Check warning on line 129 in scraper/src/libretexts2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/client.py#L127-L129

Added lines #L127 - L129 were not covered by tests

def _get_api_content(
self, api_sub_path: str, timeout: float = HTTP_TIMEOUT_SECONDS
self, api_sub_path: str, timeout: float = HTTP_TIMEOUT_NORMAL_SECONDS
) -> bytes | Any:
cache_file = self._get_cache_file(f"api_content{api_sub_path}")

Check warning on line 134 in scraper/src/libretexts2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/client.py#L134

Added line #L134 was not covered by tests
if cache_file.exists():
return json.loads(cache_file.read_text())
return cache_file.read_bytes()
cache_file.parent.mkdir(parents=True, exist_ok=True)
resp = self._get_api_resp(api_sub_path, timeout=timeout)
result = resp.content
Expand Down Expand Up @@ -165,7 +165,7 @@ def get_deki_token(self) -> str:
def get_all_pages_ids(self):
"""Returns the IDs of all pages on current website, exploring the whole tree"""

tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_SECONDS * 2)
tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS)

Check warning on line 168 in scraper/src/libretexts2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/client.py#L168

Added line #L168 was not covered by tests

page_ids: list[str] = []

Check warning on line 170 in scraper/src/libretexts2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/client.py#L170

Added line #L170 was not covered by tests

Expand All @@ -186,7 +186,7 @@ def _get_page_ids(page_node: Any) -> None:
def get_root_page_id(self) -> str:
"""Returns the ID of the root of the tree of pages"""

tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_SECONDS * 2)
tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS)
return tree["page"]["@id"]

Check warning on line 190 in scraper/src/libretexts2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/client.py#L189-L190

Added lines #L189 - L190 were not covered by tests


Expand Down
9 changes: 5 additions & 4 deletions scraper/src/libretexts2zim/entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
)
from libretexts2zim.processor import ContentFilter, Processor
from libretexts2zim.zimconfig import ZimConfig

import tempfile

def zim_defaults() -> ZimConfig:
"""Returns the default configuration for ZIM generation."""
Expand Down Expand Up @@ -138,7 +138,7 @@ def add_content_filter_flags(parser: argparse.ArgumentParser):
)


def main() -> None:
def main(tmpdir: str) -> None:
parser = argparse.ArgumentParser(
prog=NAME,
)
Expand Down Expand Up @@ -181,7 +181,7 @@ def main() -> None:
parser.add_argument(

Check warning on line 181 in scraper/src/libretexts2zim/entrypoint.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/entrypoint.py#L181

Added line #L181 was not covered by tests
"--tmp",
help="Temporary folder for cache, intermediate files, ... Default: tmp",
default=os.getenv("LIBRETEXTS_TMP", "tmp"),
default=os.getenv("LIBRETEXTS_TMP", tmpdir),
dest="tmp_folder",
)

Expand Down Expand Up @@ -248,4 +248,5 @@ def main() -> None:


if __name__ == "__main__":
main()
with tempfile.TemporaryDirectory() as tmpdir:
main(tmpdir)

0 comments on commit 4b65edf

Please sign in to comment.