Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fetch only the required subtree of the website instead of the whole site #102

Merged
merged 1 commit into from
Dec 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 74 additions & 27 deletions scraper/src/mindtouch2zim/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
"""

tags: list[str]
parent_id: str | None


class LibraryPage(BaseModel):
Expand Down Expand Up @@ -238,10 +239,10 @@
)
return tree["page"]["@id"]

def get_page_tree(self) -> LibraryTree:
def get_page_tree(self, page: str = "home") -> LibraryTree:

tree_data = self._get_api_json(
"/pages/home/tree", timeout=context.http_timeout_long_seconds
f"/pages/{page}/tree", timeout=context.http_timeout_long_seconds
)

root = LibraryPage(
Expand Down Expand Up @@ -306,32 +307,43 @@
)
return LibraryPageContent(html_body=tree["body"][0])

def get_page_definition(self, page: LibraryPage) -> LibraryPageDefinition:
def get_page_definition(self, page: LibraryPage | str) -> LibraryPageDefinition:
"""Return the definition of a given page

Definition is kept in memory, and retrieved on-demand when it is not yet there
"""
if page.definition is None:
raw_definition = self._get_api_json(
f"/pages/{page.id}", timeout=context.http_timeout_normal_seconds
)
raw_tags = raw_definition.get("tags", None)
if raw_tags is None:
raise MindtouchParsingError(f"No tags property for page {page.id}")
raw_tag = raw_tags.get("tag", None)
if raw_tag is None:
raise MindtouchParsingError(f"No tag property for page {page.id}")
if isinstance(raw_tag, list):
tags = [item.get("@value") for item in raw_tag]
else:
tags = [raw_tag.get("@value")]
page.definition = LibraryPageDefinition(
tags=tags,
)
return page.definition

def get_cover_page(self, page: LibraryPage) -> LibraryPage:
"""Get the cover page of a given page
if isinstance(page, str):
page_id = page

Check warning on line 317 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L317

Added line #L317 was not covered by tests
elif page.definition is not None:
return page.definition

Check warning on line 319 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L319

Added line #L319 was not covered by tests
else:
page_id = page.id

Check warning on line 321 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L321

Added line #L321 was not covered by tests

raw_definition = self._get_api_json(

Check warning on line 323 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L323

Added line #L323 was not covered by tests
f"/pages/{page_id}", timeout=context.http_timeout_normal_seconds
)
raw_tag = raw_definition.get("tags", {}).get("tag", None)

Check warning on line 326 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L326

Added line #L326 was not covered by tests
if raw_tag is None:
benoit74 marked this conversation as resolved.
Show resolved Hide resolved
raise MindtouchParsingError(f"No tag property for page {page_id}")

Check warning on line 328 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L328

Added line #L328 was not covered by tests
if isinstance(raw_tag, list):
tags = [item.get("@value") for item in raw_tag]

Check warning on line 330 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L330

Added line #L330 was not covered by tests
else:
tags = [raw_tag.get("@value")]

Check warning on line 332 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L332

Added line #L332 was not covered by tests

parent = raw_definition.get("page.parent", None)

Check warning on line 334 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L334

Added line #L334 was not covered by tests

page_definition = LibraryPageDefinition(

Check warning on line 336 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L336

Added line #L336 was not covered by tests
tags=tags, parent_id=None if parent is None else parent["@id"]
)

if isinstance(page, LibraryPage):
page.definition = page_definition

Check warning on line 341 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L341

Added line #L341 was not covered by tests

return page_definition

Check warning on line 343 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L343

Added line #L343 was not covered by tests

def get_cover_page(self, page: LibraryPage) -> LibraryPage | None:
"""Get the cover page of a given page object

Logic originally defined in `getCoverpage` function of
https://cdn.libretexts.net/github/LibreTextsMain/Miscellaneous/reuse.js
Expand All @@ -350,19 +362,54 @@
or "coverpage:nocommons" in current_definition.tags
):
return current_page
if "article:topic-category" in current_definition.tags:
return None

Check warning on line 366 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L366

Added line #L366 was not covered by tests
if current_page.parent is None:
raise MindtouchParsingError(
f"No more parent for {page.id}, reached root at {current_page.id}"
)
current_page = current_page.parent

def get_cover_page_encoded_url(self, page: LibraryPage) -> str:
def _get_cover_page_from_str_id(self, page_id: str) -> str | None:
"""Get the cover page ID of a given page identifier as string

Logic originally defined in `getCoverpage` function of
https://cdn.libretexts.net/github/LibreTextsMain/Miscellaneous/reuse.js

Probably originates from getCoverpage function of
https://github.com/LibreTexts/Libretext/blob/master/public/Miscellaneous/reuse.js

See https://github.com/openzim/mindtouch/issues/68 for a copy of original code
"""
current_page = page_id
while True:
current_definition = self.get_page_definition(current_page)

Check warning on line 386 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L384-L386

Added lines #L384 - L386 were not covered by tests
if (
"coverpage:yes" in current_definition.tags
or "coverpage:toc" in current_definition.tags
or "coverpage:nocommons" in current_definition.tags
):
return current_page

Check warning on line 392 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L392

Added line #L392 was not covered by tests
if "article:topic-category" in current_definition.tags:
return None

Check warning on line 394 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L394

Added line #L394 was not covered by tests
if current_definition.parent_id is None:
raise MindtouchParsingError(

Check warning on line 396 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L396

Added line #L396 was not covered by tests
f"No more parent for {page_id}, reached root at {current_page}"
)
current_page = current_definition.parent_id

Check warning on line 399 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L399

Added line #L399 was not covered by tests

def get_cover_page_encoded_url(self, page: LibraryPage) -> str | None:
"""Returns the url for the book page for a given child page"""
return self.get_cover_page(page).encoded_url
cover_page = self.get_cover_page(page)
return cover_page.encoded_url if cover_page is not None else None

Check warning on line 404 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L403-L404

Added lines #L403 - L404 were not covered by tests

def get_cover_page_id(self, page: LibraryPage) -> str:
def get_cover_page_id(self, page: LibraryPage | str) -> str | None:
"""Returns the id for the book page for a given child page"""
return self.get_cover_page(page).id
if isinstance(page, LibraryPage):
cover_page = self.get_cover_page(page)
return cover_page.id if cover_page is not None else None

Check warning on line 410 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L409-L410

Added lines #L409 - L410 were not covered by tests
else:
return self._get_cover_page_from_str_id(page)

Check warning on line 412 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L412

Added line #L412 was not covered by tests

def get_template_content(self, page_id: str, template: str) -> str:
"""Returns the templated content of a given page"""
Expand Down
8 changes: 5 additions & 3 deletions scraper/src/mindtouch2zim/libretexts/detailed_licensing.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from mindtouch2zim.client import LibraryPage, MindtouchClient
from mindtouch2zim.constants import logger
from mindtouch2zim.context import Context
from mindtouch2zim.libretexts.errors import BadBookPageError

context = Context.get()

Expand Down Expand Up @@ -87,11 +88,12 @@

"""

cover_page_url = mindtouch_client.get_cover_page_encoded_url(page)

Check warning on line 91 in scraper/src/mindtouch2zim/libretexts/detailed_licensing.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/libretexts/detailed_licensing.py#L91

Added line #L91 was not covered by tests
if cover_page_url is None:
raise BadBookPageError()

Check warning on line 93 in scraper/src/mindtouch2zim/libretexts/detailed_licensing.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/libretexts/detailed_licensing.py#L93

Added line #L93 was not covered by tests
return rewriter.rewrite(
_render_html_from_data(
jinja2_template=jinja2_template,
licensing_data=_get_licensing_report_data(
mindtouch_client.get_cover_page_encoded_url(page)
),
licensing_data=_get_licensing_report_data(cover_page_url),
)
).content
4 changes: 4 additions & 0 deletions scraper/src/mindtouch2zim/libretexts/errors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
class BadBookPageError(Exception):
benoit74 marked this conversation as resolved.
Show resolved Hide resolved
"""Raised when we are processing a special book page but we are not inside a book"""

pass
6 changes: 5 additions & 1 deletion scraper/src/mindtouch2zim/libretexts/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from zimscraperlib.rewriting.html import HtmlRewriter

from mindtouch2zim.client import LibraryPage, MindtouchClient
from mindtouch2zim.libretexts.errors import BadBookPageError


class IndexPage(BaseModel):
Expand All @@ -28,11 +29,14 @@
page: LibraryPage,
) -> str:
"""Get and rewrite index HTML"""
cover_page_id = mindtouch_client.get_cover_page_id(page)

Check warning on line 32 in scraper/src/mindtouch2zim/libretexts/index.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/libretexts/index.py#L32

Added line #L32 was not covered by tests
if cover_page_id is None:
raise BadBookPageError()

Check warning on line 34 in scraper/src/mindtouch2zim/libretexts/index.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/libretexts/index.py#L34

Added line #L34 was not covered by tests
return get_libretexts_transformed_html(
jinja2_template=jinja2_template,
libretexts_template_content=rewriter.rewrite(
mindtouch_client.get_template_content(
page_id=mindtouch_client.get_cover_page_id(page),
page_id=cover_page_id,
template="=Template%253AMindTouch%252FIDF3%252FViews%252FTag_directory",
)
).content,
Expand Down
7 changes: 6 additions & 1 deletion scraper/src/mindtouch2zim/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,7 +371,12 @@

logger.info("Fetching pages tree")
context.current_thread_workitem = "pages tree"
pages_tree = self.mindtouch_client.get_page_tree()
root_page_id = self.content_filter.root_page_id or "home"
cover_page_id = (

Check warning on line 375 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L374-L375

Added lines #L374 - L375 were not covered by tests
self.mindtouch_client.get_cover_page_id(root_page_id)
or root_page_id # if --root-page-id is not inside a book but a category
)
pages_tree = self.mindtouch_client.get_page_tree(cover_page_id)

Check warning on line 379 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L379

Added line #L379 was not covered by tests
selected_pages = self.content_filter.filter(pages_tree)
logger.info(
f"{len(selected_pages)} pages (out of {len(pages_tree.pages)}) will be "
Expand Down
82 changes: 65 additions & 17 deletions scraper/tests-integration/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,27 +18,46 @@


@pytest.fixture(scope="module")
def client(libretexts_url: str, cache_folder: Path) -> MindtouchClient:
def raw_client(libretexts_url: str, cache_folder: Path) -> MindtouchClient:
context.library_url = libretexts_url
context.cache_folder = cache_folder
return MindtouchClient()


@pytest.fixture(scope="module")
def client(
    raw_client: MindtouchClient,
    deki_token: str,  # noqa: ARG001
) -> MindtouchClient:
    """Already authenticated client (avoid having to fetch deki_token in tests)

    Requesting the ``deki_token`` fixture here forces it to be resolved before the
    client is handed to a test; its value is not used in the body (hence the ARG001
    suppression). The object returned is the very same ``raw_client`` instance.
    """
    return raw_client


@pytest.fixture(scope="module")
def home(client: MindtouchClient) -> MindtouchHome:
    """Home data returned by ``client.get_home()``, fetched once per test module"""
    return client.get_home()


@pytest.fixture(scope="module")
def deki_token(client: MindtouchClient) -> str:
return client.get_deki_token()
def deki_token(raw_client: MindtouchClient) -> str:
return raw_client.get_deki_token()


@pytest.fixture(scope="module")
def minimum_number_of_pages() -> int:
    """Lower bound on the number of pages expected in the library under test

    Tests compare against it with a strict ``>``, so the library has to expose more
    than this many pages.
    """
    return 8000


@pytest.fixture(scope="module")
def somewhere_page_id() -> LibraryPageId:
    """ID of an arbitrary page passed to ``get_page_tree`` to fetch only its subtree

    NOTE(review): presumably a page that is stable on the target library - confirm
    if the integration tests start failing on it.
    """
    return "15728"


@pytest.fixture(scope="module")
def nb_somewhere_children() -> int:
    """Number of direct children expected under the ``somewhere_page_id`` tree root"""
    return 5


@pytest.fixture(scope="module")
def root_page_id() -> LibraryPageId:
    """Page ID the integration tests expect for the library root

    NOTE(review): meaning taken from the fixture name; not all consumers are visible
    from here - confirm against the tests that request it.
    """
    return "34"
Expand All @@ -52,7 +71,6 @@ def nb_root_children() -> int:
@pytest.fixture(scope="module")
def page_tree(
client: MindtouchClient,
deki_token: str, # noqa: ARG001
) -> LibraryTree:
return client.get_page_tree()

Expand All @@ -65,20 +83,11 @@ def test_get_deki_token(deki_token: str):
def test_get_all_pages_ids(
client: MindtouchClient,
minimum_number_of_pages: int,
deki_token: str, # noqa: ARG001
):
pages_ids = client.get_all_pages_ids()
assert len(pages_ids) > minimum_number_of_pages


def test_get_root_page_id(
client: MindtouchClient,
root_page_id: LibraryPageId,
deki_token: str, # noqa: ARG001
):
assert client.get_root_page_id() == root_page_id


def test_get_page_tree_pages(
page_tree: LibraryTree,
minimum_number_of_pages: int,
Expand Down Expand Up @@ -114,6 +123,19 @@ def test_get_page_tree_subtree(
assert len(subtree2.pages.keys()) == 94


def test_get_page_tree_somewhere(
    client: MindtouchClient,
    somewhere_page_id: str,
    nb_somewhere_children: int,
):
    """Ensures a page tree can be fetched starting from a given page"""
    subtree_root = client.get_page_tree(somewhere_page_id).root

    # the requested page must be the root of the returned tree
    assert subtree_root.id == somewhere_page_id
    assert len(subtree_root.children) == nb_somewhere_children

    # root and every direct child must come back with a non-empty title
    assert subtree_root.title
    for direct_child in subtree_root.children:
        assert direct_child.title


def test_get_home_image_url(home: MindtouchHome):
"""Ensures proper image url is retrieved"""
assert home.welcome_image_url == "https://cdn.libretexts.net/Logos/geo_full.png"
Expand Down Expand Up @@ -146,8 +168,10 @@ def test_get_index_page_from_template(
):
"""Ensures we can get content of an index page"""
page_15837 = page_tree.sub_tree("15837").root
cover_page_id = client.get_cover_page_id(page_15837)
assert cover_page_id
assert client.get_template_content(
page_id=client.get_cover_page_id(page_15837),
page_id=cover_page_id,
template="=Template%253AMindTouch%252FIDF3%252FViews%252FTag_directory",
)

Expand All @@ -164,12 +188,36 @@ def test_get_cover_page_encoded_url(
)


def test_get_cover_page_id(
@pytest.mark.parametrize(
    ("current_id", "expected_cover_page_id"),
    [
        # identifiers expected to resolve to cover page 15718
        ("15837", "15718"),
        (":0794f6ff8238481ab880b6484deb65f4", "15718"),
        # identifiers for which no cover page is expected
        ("15844", None),
        ("34", None),
        ("home", None),
    ],
)
def test_get_cover_page_id_by_id(
    client: MindtouchClient,
    current_id: str,
    expected_cover_page_id: str | None,
):
    """Ensures the cover page ID is found from a page identifier passed as string"""
    found_cover_page_id = client.get_cover_page_id(current_id)
    assert found_cover_page_id == expected_cover_page_id


@pytest.mark.parametrize(
"current_id, expected_cover_page_id",
[("15837", "15718"), ("15844", None), ("34", None)],
)
def test_get_cover_page_id_by_page(
client: MindtouchClient,
page_tree: LibraryTree,
current_id: str,
expected_cover_page_id: str | None,
):
page_15837 = page_tree.sub_tree("15837").root
assert client.get_cover_page_id(page_15837) == "15718"
page_object = page_tree.sub_tree(current_id).root
assert client.get_cover_page_id(page_object) == expected_cover_page_id


def test_get_home_screen_css_url(home: MindtouchHome):
Expand Down