From cb1f2f7cc84a6d5002e3226e9e79269deaf1ad34 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Wed, 27 Nov 2024 10:17:33 +0000 Subject: [PATCH] Fetch only the required subtree of the website instead of the whole site --- scraper/src/mindtouch2zim/client.py | 101 +++++++++++++----- .../libretexts/detailed_licensing.py | 8 +- .../src/mindtouch2zim/libretexts/errors.py | 4 + scraper/src/mindtouch2zim/libretexts/index.py | 6 +- scraper/src/mindtouch2zim/processor.py | 7 +- scraper/tests-integration/test_client.py | 82 +++++++++++--- 6 files changed, 159 insertions(+), 49 deletions(-) create mode 100644 scraper/src/mindtouch2zim/libretexts/errors.py diff --git a/scraper/src/mindtouch2zim/client.py b/scraper/src/mindtouch2zim/client.py index 08df707..bc75142 100644 --- a/scraper/src/mindtouch2zim/client.py +++ b/scraper/src/mindtouch2zim/client.py @@ -36,6 +36,7 @@ class LibraryPageDefinition(BaseModel): """ tags: list[str] + parent_id: str | None class LibraryPage(BaseModel): @@ -238,10 +239,10 @@ def get_root_page_id(self) -> LibraryPageId: ) return tree["page"]["@id"] - def get_page_tree(self) -> LibraryTree: + def get_page_tree(self, page: str = "home") -> LibraryTree: tree_data = self._get_api_json( - "/pages/home/tree", timeout=context.http_timeout_long_seconds + f"/pages/{page}/tree", timeout=context.http_timeout_long_seconds ) root = LibraryPage( @@ -306,32 +307,43 @@ def get_page_content(self, page: LibraryPage) -> LibraryPageContent: ) return LibraryPageContent(html_body=tree["body"][0]) - def get_page_definition(self, page: LibraryPage) -> LibraryPageDefinition: + def get_page_definition(self, page: LibraryPage | str) -> LibraryPageDefinition: """Return the definition of a given page Definition is kept in memory, and retrieved on-demand when it is not yet there """ - if page.definition is None: - raw_definition = self._get_api_json( - f"/pages/{page.id}", timeout=context.http_timeout_normal_seconds - ) - raw_tags = raw_definition.get("tags", None) - if raw_tags is None: - raise MindtouchParsingError(f"No tags property for page {page.id}") - raw_tag = raw_tags.get("tag", None) - if raw_tag is None: - raise MindtouchParsingError(f"No tag property for page {page.id}") - if isinstance(raw_tag, list): - tags = [item.get("@value") for item in raw_tag] - else: - tags = [raw_tag.get("@value")] - page.definition = LibraryPageDefinition( - tags=tags, - ) - return page.definition - def get_cover_page(self, page: LibraryPage) -> LibraryPage: - """Get the cover page of a given page + if isinstance(page, str): + page_id = page + elif page.definition is not None: + return page.definition + else: + page_id = page.id + + raw_definition = self._get_api_json( + f"/pages/{page_id}", timeout=context.http_timeout_normal_seconds + ) + raw_tag = raw_definition.get("tags", {}).get("tag", None) + if raw_tag is None: + raise MindtouchParsingError(f"No tag property for page {page_id}") + if isinstance(raw_tag, list): + tags = [item.get("@value") for item in raw_tag] + else: + tags = [raw_tag.get("@value")] + + parent = raw_definition.get("page.parent", None) + + page_definition = LibraryPageDefinition( + tags=tags, parent_id=None if parent is None else parent["@id"] + ) + + if isinstance(page, LibraryPage): + page.definition = page_definition + + return page_definition + + def get_cover_page(self, page: LibraryPage) -> LibraryPage | None: + """Get the cover page of a given page object Logic originally defined in `getCoverpage` function of https://cdn.libretexts.net/github/LibreTextsMain/Miscellaneous/reuse.js @@ -350,19 +362,54 @@ def get_cover_page(self, page: LibraryPage) -> LibraryPage: or "coverpage:nocommons" in current_definition.tags ): return current_page + if "article:topic-category" in current_definition.tags: + return None if current_page.parent is None: raise MindtouchParsingError( f"No more parent for {page.id}, reached root at {current_page.id}" ) current_page = current_page.parent - def get_cover_page_encoded_url(self, page: LibraryPage) -> str: + def _get_cover_page_from_str_id(self, page_id: str) -> str | None: + """Get the cover page ID of a given page identifier as string + + Logic originally defined in `getCoverpage` function of + https://cdn.libretexts.net/github/LibreTextsMain/Miscellaneous/reuse.js + + Probably originates from getCoverpage function of + https://github.com/LibreTexts/Libretext/blob/master/public/Miscellaneous/reuse.js + + See https://github.com/openzim/mindtouch/issues/68 for a copy of original code + """ + current_page = page_id + while True: + current_definition = self.get_page_definition(current_page) + if ( + "coverpage:yes" in current_definition.tags + or "coverpage:toc" in current_definition.tags + or "coverpage:nocommons" in current_definition.tags + ): + return current_page + if "article:topic-category" in current_definition.tags: + return None + if current_definition.parent_id is None: + raise MindtouchParsingError( + f"No more parent for {page_id}, reached root at {current_page}" + ) + current_page = current_definition.parent_id + + def get_cover_page_encoded_url(self, page: LibraryPage) -> str | None: """Returns the url for the book page for a given child page""" - return self.get_cover_page(page).encoded_url + cover_page = self.get_cover_page(page) + return cover_page.encoded_url if cover_page is not None else None - def get_cover_page_id(self, page: LibraryPage) -> str: + def get_cover_page_id(self, page: LibraryPage | str) -> str | None: """Returns the id for the book page for a given child page""" - return self.get_cover_page(page).id + if isinstance(page, LibraryPage): + cover_page = self.get_cover_page(page) + return cover_page.id if cover_page is not None else None + else: + return self._get_cover_page_from_str_id(page) def get_template_content(self, page_id: str, template: str) -> str: """Returns the templated content of a given page""" diff --git a/scraper/src/mindtouch2zim/libretexts/detailed_licensing.py b/scraper/src/mindtouch2zim/libretexts/detailed_licensing.py index f8575c5..a9356a4 100644 --- a/scraper/src/mindtouch2zim/libretexts/detailed_licensing.py +++ b/scraper/src/mindtouch2zim/libretexts/detailed_licensing.py @@ -7,6 +7,7 @@ from mindtouch2zim.client import LibraryPage, MindtouchClient from mindtouch2zim.constants import logger from mindtouch2zim.context import Context +from mindtouch2zim.libretexts.errors import BadBookPageError context = Context.get() @@ -87,11 +88,12 @@ def rewrite_detailed_licensing( """ + cover_page_url = mindtouch_client.get_cover_page_encoded_url(page) + if cover_page_url is None: + raise BadBookPageError() return rewriter.rewrite( _render_html_from_data( jinja2_template=jinja2_template, - licensing_data=_get_licensing_report_data( - mindtouch_client.get_cover_page_encoded_url(page) - ), + licensing_data=_get_licensing_report_data(cover_page_url), ) ).content diff --git a/scraper/src/mindtouch2zim/libretexts/errors.py b/scraper/src/mindtouch2zim/libretexts/errors.py new file mode 100644 index 0000000..42e4495 --- /dev/null +++ b/scraper/src/mindtouch2zim/libretexts/errors.py @@ -0,0 +1,4 @@ +class BadBookPageError(Exception): + """Raised when we are processing a special book page but we are not inside a book""" + + pass diff --git a/scraper/src/mindtouch2zim/libretexts/index.py b/scraper/src/mindtouch2zim/libretexts/index.py index 5ca74b3..e823291 100644 --- a/scraper/src/mindtouch2zim/libretexts/index.py +++ b/scraper/src/mindtouch2zim/libretexts/index.py @@ -4,6 +4,7 @@ from zimscraperlib.rewriting.html import HtmlRewriter from mindtouch2zim.client import LibraryPage, MindtouchClient +from mindtouch2zim.libretexts.errors import BadBookPageError class IndexPage(BaseModel): @@ -28,11 +29,14 @@ def rewrite_index( page: LibraryPage, ) -> str: """Get and rewrite index HTML""" + cover_page_id = mindtouch_client.get_cover_page_id(page) + if cover_page_id is None: + raise BadBookPageError() return get_libretexts_transformed_html( jinja2_template=jinja2_template, libretexts_template_content=rewriter.rewrite( mindtouch_client.get_template_content( - page_id=mindtouch_client.get_cover_page_id(page), + page_id=cover_page_id, template="=Template%253AMindTouch%252FIDF3%252FViews%252FTag_directory", ) ).content, diff --git a/scraper/src/mindtouch2zim/processor.py b/scraper/src/mindtouch2zim/processor.py index 7f47135..53ee8d6 100644 --- a/scraper/src/mindtouch2zim/processor.py +++ b/scraper/src/mindtouch2zim/processor.py @@ -371,7 +371,12 @@ def run_with_creator(self, creator: Creator): logger.info("Fetching pages tree") context.current_thread_workitem = "pages tree" - pages_tree = self.mindtouch_client.get_page_tree() + root_page_id = self.content_filter.root_page_id or "home" + cover_page_id = ( + self.mindtouch_client.get_cover_page_id(root_page_id) + or root_page_id # if --root-page-id is not inside a book but a category + ) + pages_tree = self.mindtouch_client.get_page_tree(cover_page_id) selected_pages = self.content_filter.filter(pages_tree) logger.info( f"{len(selected_pages)} pages (out of {len(pages_tree.pages)}) will be " diff --git a/scraper/tests-integration/test_client.py b/scraper/tests-integration/test_client.py index 2c05b7d..f1277e3 100644 --- a/scraper/tests-integration/test_client.py +++ b/scraper/tests-integration/test_client.py @@ -18,20 +18,29 @@ @pytest.fixture(scope="module") -def client(libretexts_url: str, cache_folder: Path) -> MindtouchClient: +def raw_client(libretexts_url: str, cache_folder: Path) -> MindtouchClient: context.library_url = libretexts_url context.cache_folder = cache_folder return MindtouchClient() +@pytest.fixture(scope="module") +def client( + raw_client: MindtouchClient, + deki_token: str, # noqa: ARG001 +) -> MindtouchClient: + """already authenticated client (avoid having to fetch deki_token in tests)""" + return raw_client + + @pytest.fixture(scope="module") def home(client: MindtouchClient) -> MindtouchHome: return client.get_home() @pytest.fixture(scope="module") -def deki_token(client: MindtouchClient) -> str: - return client.get_deki_token() +def deki_token(raw_client: MindtouchClient) -> str: + return raw_client.get_deki_token() @pytest.fixture(scope="module") @@ -39,6 +48,16 @@ def minimum_number_of_pages() -> int: return 8000 +@pytest.fixture(scope="module") +def somewhere_page_id() -> LibraryPageId: + return "15728" + + +@pytest.fixture(scope="module") +def nb_somewhere_children() -> int: + return 5 + + @pytest.fixture(scope="module") def root_page_id() -> LibraryPageId: return "34" @@ -52,7 +71,6 @@ def nb_root_children() -> int: @pytest.fixture(scope="module") def page_tree( client: MindtouchClient, - deki_token: str, # noqa: ARG001 ) -> LibraryTree: return client.get_page_tree() @@ -65,20 +83,11 @@ def test_get_deki_token(deki_token: str): def test_get_all_pages_ids( client: MindtouchClient, minimum_number_of_pages: int, - deki_token: str, # noqa: ARG001 ): pages_ids = client.get_all_pages_ids() assert len(pages_ids) > minimum_number_of_pages -def test_get_root_page_id( - client: MindtouchClient, - root_page_id: LibraryPageId, - deki_token: str, # noqa: ARG001 -): - assert client.get_root_page_id() == root_page_id - - def test_get_page_tree_pages( page_tree: LibraryTree, minimum_number_of_pages: int, @@ -114,6 +123,19 @@ def test_get_page_tree_subtree( assert len(subtree2.pages.keys()) == 94 +def test_get_page_tree_somewhere( + client: MindtouchClient, + somewhere_page_id: str, + nb_somewhere_children: int, +): + page_tree = client.get_page_tree(somewhere_page_id) + assert page_tree.root.id == somewhere_page_id + assert len(page_tree.root.children) == nb_somewhere_children + assert page_tree.root.title + for child in page_tree.root.children: + assert child.title + + def test_get_home_image_url(home: MindtouchHome): """Ensures proper image url is retrieved""" assert home.welcome_image_url == "https://cdn.libretexts.net/Logos/geo_full.png" @@ -146,8 +168,10 @@ def test_get_index_page_from_template( ): """Ensures we can get content of an index page""" page_15837 = page_tree.sub_tree("15837").root + cover_page_id = client.get_cover_page_id(page_15837) + assert cover_page_id assert client.get_template_content( - page_id=client.get_cover_page_id(page_15837), + page_id=cover_page_id, template="=Template%253AMindTouch%252FIDF3%252FViews%252FTag_directory", ) @@ -164,12 +188,36 @@ def test_get_cover_page_encoded_url( ) -def test_get_cover_page_id( +@pytest.mark.parametrize( + "current_id, expected_cover_page_id", + [ + ("15837", "15718"), + (":0794f6ff8238481ab880b6484deb65f4", "15718"), + ("15844", None), + ("34", None), + ("home", None), + ], +) +def test_get_cover_page_id_by_id( + client: MindtouchClient, + current_id: str, + expected_cover_page_id: str | None, +): + assert client.get_cover_page_id(current_id) == expected_cover_page_id + + +@pytest.mark.parametrize( + "current_id, expected_cover_page_id", + [("15837", "15718"), ("15844", None), ("34", None)], +) +def test_get_cover_page_id_by_page( client: MindtouchClient, page_tree: LibraryTree, + current_id: str, + expected_cover_page_id: str | None, ): - page_15837 = page_tree.sub_tree("15837").root - assert client.get_cover_page_id(page_15837) == "15718" + page_object = page_tree.sub_tree(current_id).root + assert client.get_cover_page_id(page_object) == expected_cover_page_id def test_get_home_screen_css_url(home: MindtouchHome):