From a1352e86d55038bf5d770fe26c62a0bf5c8bfcbd Mon Sep 17 00:00:00 2001
From: benoit74
Date: Thu, 24 Oct 2024 09:45:09 +0000
Subject: [PATCH 1/5] Move code around to isolate page processing logic

---
 scraper/src/mindtouch2zim/processor.py | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/scraper/src/mindtouch2zim/processor.py b/scraper/src/mindtouch2zim/processor.py
index 6d9df87..d6168f4 100644
--- a/scraper/src/mindtouch2zim/processor.py
+++ b/scraper/src/mindtouch2zim/processor.py
@@ -373,15 +373,7 @@ def run(self) -> Path:
 
             logger.info("Fetching pages content")
             for page in selected_pages:
-                logger.debug(f"  Fetching {page.id}")
-                page_content = self.mindtouch_client.get_page_content(page)
-                add_item_for(
-                    creator,
-                    f"content/page_content_{page.id}.json",
-                    content=PageContentModel(
-                        html_body=page_content.html_body
-                    ).model_dump_json(by_alias=True),
-                )
+                self._process_page(creator=creator, page=page)
 
         return zim_path
 
@@ -416,3 +408,17 @@ def _process_css(
             **url_rewriter.items_to_download,
         }
         add_item_for(creator, f"content/{target_filename}", content=result)
+
+    def _process_page(self, creator: Creator, page: LibraryPage):
+        """Process a given library page
+        Download content, rewrite HTML and add JSON to ZIM
+        """
+        logger.debug(f"  Fetching {page.id}")
+        page_content = self.mindtouch_client.get_page_content(page)
+        add_item_for(
+            creator,
+            f"content/page_content_{page.id}.json",
+            content=PageContentModel(html_body=page_content.html_body).model_dump_json(
+                by_alias=True
+            ),
+        )

From 3da37ddc8e983b48d4b72639345c53f5694e0976 Mon Sep 17 00:00:00 2001
From: benoit74
Date: Fri, 25 Oct 2024 07:36:13 +0000
Subject: [PATCH 2/5] Download assets at the end and make it generic

---
 scraper/src/mindtouch2zim/processor.py | 39 +++++++++++++++++--------------------
 1 file changed, 19 insertions(+), 20 deletions(-)

diff --git a/scraper/src/mindtouch2zim/processor.py b/scraper/src/mindtouch2zim/processor.py
index d6168f4..c98f011 100644
--- a/scraper/src/mindtouch2zim/processor.py
+++ b/scraper/src/mindtouch2zim/processor.py
@@ -331,26 +331,6 @@ def run(self) -> Path:
                 creator=creator,
             )
 
-            logger.info(f"  Retrieving {len(self.items_to_download)} CSS assets...")
-            for asset_path, asset_url in self.items_to_download.items():
-                try:
-                    css_asset = BytesIO()
-                    stream_file(asset_url.value, byte_stream=css_asset)
-                    logger.debug(
-                        f"Adding {asset_url.value} to {asset_path.value} in the ZIM"
-                    )
-                    add_item_for(
-                        creator,
-                        "content/" + asset_path.value,
-                        content=css_asset.getvalue(),
-                    )
-                    del css_asset
-                except HTTPError as exc:
-                    # would make more sense to be a warning, but this is just too
-                    # verbose, at least on geo.libretexts.org many assets are just
-                    # missing
-                    logger.debug(f"Ignoring {asset_path.value} due to {exc}")
-
             logger.info("Fetching pages tree")
             pages_tree = self.mindtouch_client.get_page_tree()
             selected_pages = self.content_filter.filter(pages_tree)
@@ -375,6 +355,25 @@ def run(self) -> Path:
             for page in selected_pages:
                 self._process_page(creator=creator, page=page)
 
+            logger.info(f"  Retrieving {len(self.items_to_download)} assets...")
+            for asset_path, asset_url in self.items_to_download.items():
+                try:
+                    asset_content = BytesIO()
+                    stream_file(asset_url.value, byte_stream=asset_content)
+                    logger.debug(
+                        f"Adding {asset_url.value} to {asset_path.value} in the ZIM"
+                    )
+                    add_item_for(
+                        creator,
+                        "content/" + asset_path.value,
+                        content=asset_content.getvalue(),
+                    )
+                except HTTPError as exc:
+                    # would make more sense to be a warning, but this is just too
+                    # verbose, at least on geo.libretexts.org many assets are just
+                    # missing
+                    logger.debug(f"Ignoring {asset_path.value} due to {exc}")
+
         return zim_path
 
     def _process_css(

From 18833177950dec8d669be62033386bb36418d796 Mon Sep 17 00:00:00 2001
From: benoit74
Date: Fri, 25 Oct 2024 07:39:51 +0000
Subject: [PATCH 3/5] Ensure library URL does not contain a trailing slash

---
 scraper/src/mindtouch2zim/entrypoint.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/scraper/src/mindtouch2zim/entrypoint.py b/scraper/src/mindtouch2zim/entrypoint.py
index 3508efc..2026e61 100644
--- a/scraper/src/mindtouch2zim/entrypoint.py
+++ b/scraper/src/mindtouch2zim/entrypoint.py
@@ -152,8 +152,9 @@ def main(tmpdir: str) -> None:
     # Client configuration flags
     parser.add_argument(
         "--library-url",
-        help="URL of the Mindtouch / Nice CXone Expert instance, e.g. for LibreTexts "
-        "Geosciences it is https://geo.libretexts.org/",
+        help="URL of the Mindtouch / Nice CXone Expert instance (must NOT contain a "
+        "trailing slash), e.g. for LibreTexts Geosciences it is "
+        "https://geo.libretexts.org",
         required=True,
     )
 
@@ -217,6 +218,8 @@ def main(tmpdir: str) -> None:
     tmp_folder.mkdir(exist_ok=True)
     validate_zimfile_creatable(tmp_folder, "test.txt")
 
+    library_url = str(args.library_url).rstrip("/")
+
     try:
         zim_config = ZimConfig.of(args)
         doc_filter = ContentFilter.of(args)
@@ -225,7 +228,7 @@ def main(tmpdir: str) -> None:
         cache_folder.mkdir(exist_ok=True)
 
         mindtouch_client = MindtouchClient(
-            library_url=args.library_url,
+            library_url=library_url,
             cache_folder=cache_folder,
         )

From 5f14fe49dff882cfd02158d21b4044c498bc1646 Mon Sep 17 00:00:00 2001
From: benoit74
Date: Fri, 25 Oct 2024 07:47:13 +0000
Subject: [PATCH 4/5] Adapt to zimscraperlib not storing items to download anymore

---
 scraper/pyproject.toml                 |  2 +-
 scraper/src/mindtouch2zim/processor.py | 85 ++++++++++++++++++--------
 2 files changed, 62 insertions(+), 25 deletions(-)

diff --git a/scraper/pyproject.toml b/scraper/pyproject.toml
index 28b5e54..801877d 100644
--- a/scraper/pyproject.toml
+++ b/scraper/pyproject.toml
@@ -11,7 +11,7 @@ dependencies = [
     "yt-dlp",  # youtube-dl should be updated as frequently as possible
     "jinja2==3.1.4",
     # use zimscraperlib pinned version once content rewriting functions have been released
-    "zimscraperlib @ git+https://github.com/openzim/python-scraperlib@main",
+    "zimscraperlib @ git+https://github.com/openzim/python-scraperlib@mindtouch_changes",
     "requests==2.32.3",
     "types-requests==2.32.0.20240914",
     "kiwixstorage==0.9.0",
diff --git a/scraper/src/mindtouch2zim/processor.py b/scraper/src/mindtouch2zim/processor.py
index c98f011..407dc82 100644
--- a/scraper/src/mindtouch2zim/processor.py
+++ b/scraper/src/mindtouch2zim/processor.py
@@ -14,6 +14,7 @@
 from zimscraperlib.rewriting.url_rewriting import (
     ArticleUrlRewriter,
     HttpUrl,
+    RewriteResult,
     ZimPath,
 )
 from zimscraperlib.zim import Creator
@@ -313,7 +314,7 @@ def run(self) -> Path:
             add_item_for(creator, "content/logo.png", content=welcome_image.getvalue())
             del welcome_image
 
-            self.items_to_download: dict[ZimPath, HttpUrl] = {}
+            self.items_to_download: dict[ZimPath, set[HttpUrl]] = {}
             self._process_css(
                 css_location=home.screen_css_url,
                 target_filename="screen.css",
@@ -356,23 +357,25 @@ def run(self) -> Path:
                 self._process_page(creator=creator, page=page)
 
             logger.info(f"  Retrieving {len(self.items_to_download)} assets...")
-            for asset_path, asset_url in self.items_to_download.items():
-                try:
-                    asset_content = BytesIO()
-                    stream_file(asset_url.value, byte_stream=asset_content)
-                    logger.debug(
-                        f"Adding {asset_url.value} to {asset_path.value} in the ZIM"
-                    )
-                    add_item_for(
-                        creator,
-                        "content/" + asset_path.value,
-                        content=asset_content.getvalue(),
-                    )
-                except HTTPError as exc:
-                    # would make more sense to be a warning, but this is just too
-                    # verbose, at least on geo.libretexts.org many assets are just
-                    # missing
-                    logger.debug(f"Ignoring {asset_path.value} due to {exc}")
+            for asset_path, asset_urls in self.items_to_download.items():
+                for asset_url in asset_urls:
+                    try:
+                        asset_content = BytesIO()
+                        stream_file(asset_url.value, byte_stream=asset_content)
+                        logger.debug(
+                            f"Adding {asset_url.value} to {asset_path.value} in the ZIM"
+                        )
+                        add_item_for(
+                            creator,
+                            "content/" + asset_path.value,
+                            content=asset_content.getvalue(),
+                        )
+                        break  # file found and added
+                    except HTTPError as exc:
+                        # would make more sense to be a warning, but this is just too
+                        # verbose, at least on geo.libretexts.org many assets are just
+                        # missing
+                        logger.debug(f"Ignoring {asset_path.value} due to {exc}")
 
         return zim_path
 
@@ -392,20 +395,23 @@ def _process_css(
         css_buffer = BytesIO()
         stream_file(css_location, byte_stream=css_buffer)
         css_content = css_buffer.getvalue()
-        url_rewriter = ArticleUrlRewriter(
+        url_rewriter = CssUrlsRewriter(
             article_url=HttpUrl(css_location),
             article_path=ZimPath(target_filename),
         )
-        css_rewriter = CssRewriter(url_rewriter=url_rewriter, base_href=None)
+        css_rewriter = CssRewriter(
+            url_rewriter=url_rewriter, base_href=None, remove_errors=True
+        )
         result = css_rewriter.rewrite(content=css_content)
         # Rebuild the dict since we might have "conflict" of ZimPath (two urls leading
         # to the same ZimPath) and we prefer to use the first URL encountered, whereas
         # using self.items_to_download.update would override the key value, preferring
        # to use the last URL encountered.
-        self.items_to_download = {
-            **self.items_to_download,
-            **url_rewriter.items_to_download,
-        }
+        for path, urls in url_rewriter.items_to_download.items():
+            if path in self.items_to_download:
+                self.items_to_download[path].update(urls)
+            else:
+                self.items_to_download[path] = urls
         add_item_for(creator, f"content/{target_filename}", content=result)
 
     def _process_page(self, creator: Creator, page: LibraryPage):
@@ -421,3 +427,34 @@ def _process_page(self, creator: Creator, page: LibraryPage):
                 by_alias=True
             ),
         )
+
+
+class CssUrlsRewriter(ArticleUrlRewriter):
+
+    def __init__(
+        self,
+        *,
+        article_url: HttpUrl,
+        article_path: ZimPath,
+    ):
+        super().__init__(
+            article_url=article_url,
+            article_path=article_path,
+        )
+        self.items_to_download: dict[ZimPath, set[HttpUrl]] = {}
+
+    def __call__(
+        self,
+        item_url: str,
+        base_href: str | None,
+        *,
+        rewrite_all_url: bool = True,  # noqa: ARG002
+    ) -> RewriteResult:
+        result = super().__call__(item_url, base_href, rewrite_all_url=True)
+        if result.zim_path is None:
+            return result
+        if result.zim_path in self.items_to_download:
+            self.items_to_download[result.zim_path].add(HttpUrl(result.absolute_url))
+        else:
+            self.items_to_download[result.zim_path] = {HttpUrl(result.absolute_url)}
+        return result

From 8f68bf969e96f7e111d11f33ec47db058eef06ff Mon Sep 17 00:00:00 2001
From: benoit74
Date: Fri, 25 Oct 2024 08:04:27 +0000
Subject: [PATCH 5/5] Rewrite HTML to fix <a> and <img> and download images

---
 scraper/src/mindtouch2zim/processor.py | 140 ++++++++++++++++++++++++-
 1 file changed, 137 insertions(+), 3 deletions(-)

diff --git a/scraper/src/mindtouch2zim/processor.py b/scraper/src/mindtouch2zim/processor.py
index 407dc82..0704a50 100644
--- a/scraper/src/mindtouch2zim/processor.py
+++ b/scraper/src/mindtouch2zim/processor.py
@@ -11,6 +11,8 @@
 )
 from zimscraperlib.image import resize_image
 from zimscraperlib.rewriting.css import CssRewriter
+from zimscraperlib.rewriting.html import HtmlRewriter
+from zimscraperlib.rewriting.html import rules as html_rules
 from zimscraperlib.rewriting.url_rewriting import (
     ArticleUrlRewriter,
     HttpUrl,
@@ -49,6 +51,12 @@ class MissingDocumentError(Exception):
     pass
 
 
+class UnsupportedTagError(Exception):
+    """An exception raised when an HTML tag is not expected to be encountered"""
+
+    pass
+
+
 class ContentFilter(BaseModel):
     """Supports filtering documents by user provided attributes."""
 
@@ -353,8 +361,18 @@ def run(self) -> Path:
             )
 
             logger.info("Fetching pages content")
+            # compute the list of existing pages to properly rewrite links leading
+            # in-ZIM / out-of-ZIM
+            existing_html_pages = {
+                ArticleUrlRewriter.normalize(
+                    HttpUrl(f"{self.mindtouch_client.library_url}/{page.path}")
+                )
+                for page in selected_pages
+            }
             for page in selected_pages:
-                self._process_page(creator=creator, page=page)
+                self._process_page(
+                    creator=creator, page=page, existing_zim_paths=existing_html_pages
+                )
 
             logger.info(f"  Retrieving {len(self.items_to_download)} assets...")
             for asset_path, asset_urls in self.items_to_download.items():
@@ -414,22 +432,138 @@ def _process_css(
         add_item_for(creator, f"content/{target_filename}", content=result)
 
-    def _process_page(self, creator: Creator, page: LibraryPage):
+    def _process_page(
+        self, creator: Creator, page: LibraryPage, existing_zim_paths: set[ZimPath]
+    ):
         """Process a given library page
         Download content, rewrite HTML and add JSON to ZIM
         """
         logger.debug(f"  Fetching {page.id}")
         page_content = self.mindtouch_client.get_page_content(page)
+        url_rewriter = HtmlUrlsRewriter(
+            self.mindtouch_client.library_url,
+            page,
+            existing_zim_paths=existing_zim_paths,
+        )
+        rewriter = HtmlRewriter(
+            url_rewriter=url_rewriter,
+            pre_head_insert=None,
+            post_head_insert=None,
+            notify_js_module=None,
+        )
+        rewriten = rewriter.rewrite(page_content.html_body)
+        for path, urls in url_rewriter.items_to_download.items():
+            if path in self.items_to_download:
+                self.items_to_download[path].update(urls)
+            else:
+                self.items_to_download[path] = urls
         add_item_for(
             creator,
             f"content/page_content_{page.id}.json",
-            content=PageContentModel(html_body=page_content.html_body).model_dump_json(
+            content=PageContentModel(html_body=rewriten.content).model_dump_json(
                 by_alias=True
             ),
         )
+
+
+# remove all standard rules, they are not adapted to Vue.JS UI
+html_rules.rewrite_attribute_rules.clear()
+html_rules.rewrite_data_rules.clear()
+html_rules.rewrite_tag_rules.clear()
+
+
+@html_rules.rewrite_attribute()
+def rewrite_href_src_attributes(
+    tag: str,
+    attr_name: str,
+    attr_value: str | None,
+    url_rewriter: ArticleUrlRewriter,
+    base_href: str | None,
+):
+    """Rewrite href and src attributes"""
+    if attr_name not in ("href", "src") or not attr_value:
+        return
+    if not isinstance(url_rewriter, HtmlUrlsRewriter):
+        raise Exception("Expecting HtmlUrlsRewriter")
+    new_attr_value = None
+    if tag == "a":
+        rewrite_result = url_rewriter(
+            attr_value, base_href=base_href, rewrite_all_url=False
+        )
+        # rewrite links for proper navigation inside ZIM Vue.JS UI (if inside ZIM) or
+        # full link (if outside the current library)
+        new_attr_value = (
+            f"#/{rewrite_result.rewriten_url[len(url_rewriter.library_path.value) :]}"
+            if rewrite_result.rewriten_url.startswith(url_rewriter.library_path.value)
+            else rewrite_result.rewriten_url
+        )
+    if tag == "img":
+        rewrite_result = url_rewriter(
+            attr_value, base_href=base_href, rewrite_all_url=True
+        )
+        # add 'content/' to the URL since all assets will be stored in the sub-path
+        new_attr_value = f"content/{rewrite_result.rewriten_url}"
+        if rewrite_result.zim_path is not None:
+            # if item is expected to be inside the ZIM, store asset information so that
+            # we can download it afterwards
+            if rewrite_result.zim_path in url_rewriter.items_to_download:
+                url_rewriter.items_to_download[rewrite_result.zim_path].add(
+                    HttpUrl(rewrite_result.absolute_url)
+                )
+            else:
+                url_rewriter.items_to_download[rewrite_result.zim_path] = {
+                    HttpUrl(rewrite_result.absolute_url)
+                }
+    if not new_attr_value:
+        # we do not (yet) support other tags / attributes so we fail the scraper
+        raise ValueError(
+            f"Empty new value when rewriting {attr_value} from {attr_name} in {tag} tag"
+        )
+    return (attr_name, new_attr_value)
+
+
+@html_rules.drop_attribute()
+def drop_sizes_and_srcset_attribute(tag: str, attr_name: str):
+    """Drop srcset and sizes attributes in <img> tags"""
+    return tag == "img" and attr_name in ("srcset", "sizes")
+
+
+@html_rules.rewrite_tag()
+def refuse_unsupported_tags(tag: str):
+    """Stop scraper if unsupported tag is encountered"""
+    if tag not in ["picture"]:
+        return
+    raise UnsupportedTagError(f"Tag {tag} is not yet supported in this scraper")
+
+
+class HtmlUrlsRewriter(ArticleUrlRewriter):
+    """A rewriter for HTML processing
+
+    This rewriter does not store items to download on-the-fly but has containers and
+    metadata so that HTML rewriting rules can decide what needs to be downloaded
+    """
+
+    def __init__(
+        self, library_url: str, page: LibraryPage, existing_zim_paths: set[ZimPath]
+    ):
+        super().__init__(
+            article_url=HttpUrl(f"{library_url}/{page.path}"),
+            article_path=ZimPath("index.html"),
+            existing_zim_paths=existing_zim_paths,
+        )
+        self.library_url = library_url
+        self.library_path = ArticleUrlRewriter.normalize(HttpUrl(f"{library_url}/"))
+        self.items_to_download: dict[ZimPath, set[HttpUrl]] = {}
+
+    def __call__(
+        self, item_url: str, base_href: str | None, *, rewrite_all_url: bool = True
+    ) -> RewriteResult:
+        result = super().__call__(item_url, base_href, rewrite_all_url=rewrite_all_url)
+        return result
+
+
 class CssUrlsRewriter(ArticleUrlRewriter):
+    """A rewriter for CSS processing, storing items to download as URLs are processed"""
 
     def __init__(
         self,
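
Note (not part of the patches): a minimal standalone sketch of the asset de-duplication pattern introduced in PATCH 2 and PATCH 4, where each ZIM path maps to a set of candidate URLs and the first URL that downloads successfully wins. The names record_candidate and download_all are hypothetical, plain strings stand in for ZimPath/HttpUrl, and requests (already a scraper dependency) stands in for the stream_file helper.

    import requests
    from requests.exceptions import HTTPError

    # ZIM path -> set of candidate URLs (mirrors dict[ZimPath, set[HttpUrl]])
    items_to_download: dict[str, set[str]] = {}

    def record_candidate(path: str, url: str) -> None:
        # same merge logic as the patches: extend the set if the path is already known
        items_to_download.setdefault(path, set()).add(url)

    def download_all() -> dict[str, bytes]:
        # try each candidate URL per path; first success wins, missing assets are skipped
        results: dict[str, bytes] = {}
        for path, urls in items_to_download.items():
            for url in urls:
                try:
                    resp = requests.get(url, timeout=30)
                    resp.raise_for_status()
                    results[path] = resp.content
                    break  # file found and added
                except HTTPError:
                    continue  # asset missing upstream; try the next candidate
        return results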