From 86ba7ebce85ce5be766a1316e4966a898fbf1920 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 15 Oct 2024 14:34:19 +0000 Subject: [PATCH 1/2] Use scraperlib content rewriting code (imported from warc2zim) --- scraper/pyproject.toml | 3 +- scraper/src/libretexts2zim/css.py | 143 ----------- scraper/src/libretexts2zim/processor.py | 53 ++-- scraper/tests/test_css.py | 310 ------------------------ 4 files changed, 40 insertions(+), 469 deletions(-) delete mode 100644 scraper/src/libretexts2zim/css.py delete mode 100644 scraper/tests/test_css.py diff --git a/scraper/pyproject.toml b/scraper/pyproject.toml index 0322b7d..489d28a 100644 --- a/scraper/pyproject.toml +++ b/scraper/pyproject.toml @@ -10,7 +10,8 @@ readme = "../README.md" dependencies = [ "yt-dlp", # youtube-dl should be updated as frequently as possible "jinja2==3.1.4", - "zimscraperlib==4.0.0", + # use zimscraperlib pinned version once content rewriting functions have been released + "zimscraperlib @ git+https://github.com/openzim/python-scraperlib@main", "requests==2.32.3", "types-requests==2.32.0.20240914", "kiwixstorage==0.9.0", diff --git a/scraper/src/libretexts2zim/css.py b/scraper/src/libretexts2zim/css.py deleted file mode 100644 index f11cc9e..0000000 --- a/scraper/src/libretexts2zim/css.py +++ /dev/null @@ -1,143 +0,0 @@ -from collections.abc import Iterable -from pathlib import Path -from urllib.parse import urljoin, urlparse - -from tinycss2 import ast, parse_stylesheet_bytes, serialize # pyright: ignore -from tinycss2.serializer import serialize_url # pyright: ignore - -from libretexts2zim.utils import get_asset_path_from_url - -OriginalUrl = str -FullZimPath = Path -RelativeCssPath = Path - - -class CssProcessor: - """Utility to to process CSS, extract assets and rewrite URLs - - This utility can process multiple CSS documents that will be stored in a ZIM - It extracts the list of assets (images, fonts) that are used in the CSS documents - and compute appropriate ZIM paths for each of 
them. - - Arguments: - css_target_path: "folder" where the CSS documents that will be processed will be - stored in the ZIM - css_assets_root_path: "folder" where the CSS assets referenced in the CSS - documents will be stored in the ZIM - """ - - def __init__( - self, - css_target_path: Path = Path("/content"), - css_assets_root_path: Path = Path("/content/css_assets"), - ) -> None: - self.css_target_path = css_target_path - self.css_assets_root_path = css_assets_root_path - self.css_assets: dict[OriginalUrl, FullZimPath] = {} - self.used_paths: list[RelativeCssPath] = [] - - def process(self, css_original_url: str, css_content: bytes) -> str: - """Rewrite CSS rules and update list of assets to fetch - - This function updates the CSS rules to target assets path inside the ZIM - It also updates the list of `css_assets` which is the list of online resources - referenced inside the ZIM and which should be fetched and stored inside the ZIM - for proper CSS operation. - """ - rules, _ = parse_stylesheet_bytes( # pyright: ignore[reportUnknownVariableType] - css_content - ) - self._process_list( - css_original_url, - rules, # pyright: ignore[reportUnknownArgumentType] - ) - return serialize( - [ - rule - for rule in rules # pyright: ignore[reportUnknownVariableType] - if not isinstance(rule, ast.ParseError) - ] - ) - - def _process_url( - self, css_original_url: str, css_url: str - ) -> RelativeCssPath | None: - """Process a URL which has been found in CSS rules - - - Transforms the URL into a ZIM path - - Updates the list of assets to retrieve - """ - original_url = urljoin(css_original_url, css_url) - original_url_parsed = urlparse(original_url) - if original_url_parsed.scheme.lower() not in ["http", "https"]: - return None - if original_url in self.css_assets: - return self.css_assets[original_url].relative_to(self.css_target_path) - relative_path = get_asset_path_from_url(original_url, self.used_paths) - self.used_paths.append(relative_path) - target_path = 
self.css_assets_root_path / relative_path - self.css_assets[original_url] = target_path - return target_path.relative_to(self.css_target_path) - - def _process_node(self, css_original_url: str, node: ast.Node): - """Process one single CSS node""" - if isinstance( - node, - ast.QualifiedRule - | ast.SquareBracketsBlock - | ast.ParenthesesBlock - | ast.CurlyBracketsBlock, - ): - self._process_list( - css_original_url, - node.content, # pyright: ignore[reportUnknownArgumentType, reportUnknownMemberType] - ) - elif isinstance(node, ast.FunctionBlock): - if node.lower_name == "url": # pyright: ignore[reportUnknownMemberType] - url_node: ast.Node = node.arguments[0] # pyright: ignore - relative_css_path = self._process_url( - css_original_url, - url_node.value, # pyright: ignore - ) - if not relative_css_path: - return - url_node.value = str(relative_css_path) # pyright: ignore - url_node.representation = ( # pyright: ignore - f'"{serialize_url(str(relative_css_path))}"' - ) - - else: - self._process_list( - css_original_url, - node.arguments, # pyright: ignore - ) - elif isinstance(node, ast.AtRule): - self._process_list( - css_original_url, - node.prelude, # pyright: ignore - ) - self._process_list( - css_original_url, - node.content, # pyright: ignore - ) - elif isinstance(node, ast.Declaration): - self._process_list( - css_original_url, - node.value, # pyright: ignore - ) - elif isinstance(node, ast.URLToken): - relative_css_path = self._process_url( - css_original_url, - node.value, # pyright: ignore - ) - if not relative_css_path: - return - node.value = str(relative_css_path) - node.representation = f"url({serialize_url(str(relative_css_path))})" - - def _process_list(self, css_original_url: str, nodes: Iterable[ast.Node] | None): - """Process a list of CSS nodes""" - if not nodes: - return - for node in nodes: - self._process_node(css_original_url, node) diff --git a/scraper/src/libretexts2zim/processor.py b/scraper/src/libretexts2zim/processor.py index 
dfc0265..928bd01 100644 --- a/scraper/src/libretexts2zim/processor.py +++ b/scraper/src/libretexts2zim/processor.py @@ -10,6 +10,12 @@ stream_file, # pyright: ignore[reportUnknownVariableType] ) from zimscraperlib.image import resize_image +from zimscraperlib.rewriting.css import CssRewriter +from zimscraperlib.rewriting.url_rewriting import ( + ArticleUrlRewriter, + HttpUrl, + ZimPath, +) from zimscraperlib.zim import Creator from zimscraperlib.zim.filesystem import validate_zimfile_creatable from zimscraperlib.zim.indexing import IndexData @@ -22,7 +28,6 @@ LibreTextsMetadata, ) from libretexts2zim.constants import LANGUAGE_ISO_639_3, NAME, ROOT_DIR, VERSION, logger -from libretexts2zim.css import CssProcessor from libretexts2zim.ui import ( ConfigModel, PageContentModel, @@ -307,44 +312,62 @@ def run(self) -> Path: add_item_for(creator, "content/logo.png", content=welcome_image.getvalue()) del welcome_image - css_processor = CssProcessor() + items_to_download: dict[ZimPath, HttpUrl] = {} screen_css = BytesIO() stream_file(home.screen_css_url, byte_stream=screen_css) - result = css_processor.process( - css_original_url=home.screen_css_url, css_content=screen_css.getvalue() + url_rewriter = ArticleUrlRewriter( + article_url=HttpUrl(home.screen_css_url), + article_path=ZimPath("screen.css"), ) + css_rewriter = CssRewriter(url_rewriter=url_rewriter, base_href=None) + result = css_rewriter.rewrite(content=screen_css.getvalue()) + items_to_download = {**items_to_download, **url_rewriter.items_to_download} add_item_for(creator, "content/screen.css", content=result) del screen_css + del css_rewriter + del url_rewriter print_css = BytesIO() stream_file(home.print_css_url, byte_stream=print_css) - result = css_processor.process( - css_original_url=home.print_css_url, css_content=print_css.getvalue() + url_rewriter = ArticleUrlRewriter( + article_url=HttpUrl(home.print_css_url), + article_path=ZimPath("print.css"), ) + css_rewriter = CssRewriter(url_rewriter=url_rewriter, 
base_href=None) + result = css_rewriter.rewrite(content=print_css.getvalue()) + items_to_download = {**items_to_download, **url_rewriter.items_to_download} add_item_for(creator, "content/print.css", content=result) del print_css + del css_rewriter + del url_rewriter - result = css_processor.process( - css_original_url=home.home_url, - css_content=("\n".join(home.inline_css)).encode(), + url_rewriter = ArticleUrlRewriter( + article_url=HttpUrl(home.home_url), article_path=ZimPath("inline.css") ) + css_rewriter = CssRewriter(url_rewriter=url_rewriter, base_href=None) + result = css_rewriter.rewrite(content=("\n".join(home.inline_css))) + items_to_download = {**items_to_download, **url_rewriter.items_to_download} add_item_for(creator, "content/inline.css", content=result) - logger.info(f" Retrieving {len(css_processor.css_assets)} CSS assets...") - for asset_url, asset_path in css_processor.css_assets.items(): + logger.info(f" Retrieving {len(items_to_download)} CSS assets...") + for asset_path, asset_url in items_to_download.items(): try: css_asset = BytesIO() - stream_file(asset_url, byte_stream=css_asset) + stream_file(asset_url.value, byte_stream=css_asset) + logger.debug( + f"Adding {asset_url.value} to {asset_path.value} in the ZIM" + ) add_item_for( - creator, str(asset_path)[1:], content=css_asset.getvalue() + creator, + "content/" + asset_path.value, + content=css_asset.getvalue(), ) - logger.debug(f"Adding {asset_url} to {asset_path} in the ZIM") del css_asset except HTTPError as exc: # would make more sense to be a warning, but this is just too # verbose, at least on geo.libretexts.org many assets are just # missing - logger.debug(f"Ignoring {asset_path} due to {exc}") + logger.debug(f"Ignoring {asset_path.value} due to {exc}") logger.info("Fetching pages tree") pages_tree = self.libretexts_client.get_page_tree() diff --git a/scraper/tests/test_css.py b/scraper/tests/test_css.py deleted file mode 100644 index b3d7f5e..0000000 --- a/scraper/tests/test_css.py 
+++ /dev/null @@ -1,310 +0,0 @@ -from pathlib import Path - -import pytest - -from libretexts2zim.css import CssProcessor - - -@pytest.mark.parametrize( - "css_document_content, css_document_url, expected_assets, expected_css_rewritten", - [ - pytest.param( - """ -body { - background-image: url('https://example.com/image.jpg'); -} -""", - "https://www.acme.com/styles/main.css", - {"https://example.com/image.jpg": Path("/content/css_assets/image.jpg")}, - """ -body { - background-image: url("css_assets/image.jpg"); -} -""", - id="basic_full", - ), - pytest.param( - """ -body { - background-image: url('/assets/image.jpg'); -} -""", - "https://www.acme.com/styles/main.css", - { - "https://www.acme.com/assets/image.jpg": Path( - "/content/css_assets/assets/image.jpg" - ) - }, - """ -body { - background-image: url("css_assets/assets/image.jpg"); -} -""", - id="basic_absolute", - ), - pytest.param( - """ -body { - background-image: url('../image.jpg'); -} -""", - "https://www.acme.com/styles/main.css", - {"https://www.acme.com/image.jpg": Path("/content/css_assets/image.jpg")}, - """ -body { - background-image: url("css_assets/image.jpg"); -} -""", - id="basic_relative1", - ), - pytest.param( - """ -body { - background-image: url('./image.jpg'); -} -""", - "https://www.acme.com/styles/main.css", - { - "https://www.acme.com/styles/image.jpg": Path( - "/content/css_assets/styles/image.jpg" - ) - }, - """ -body { - background-image: url("css_assets/styles/image.jpg"); -} -""", - id="basic_relative2", - ), - pytest.param( - """ -@import url("print.css") -""", - "https://www.acme.com/styles/main.css", - { - "https://www.acme.com/styles/print.css": Path( - "/content/css_assets/styles/print.css" - ) - }, - """ -@import url("css_assets/styles/print.css") -;""", - id="import", - ), - pytest.param( - """ -body { - background-image: url('https://example.com/image.jpg'), url('/assets/image.jpg'); -} -""", - "https://www.acme.com/styles/main.css", - { - 
"https://example.com/image.jpg": Path("/content/css_assets/image.jpg"), - "https://www.acme.com/assets/image.jpg": Path( - "/content/css_assets/assets/image.jpg" - ), - }, - """ -body { - background-image: url("css_assets/image.jpg"), url("css_assets/assets/image.jpg"); -} -""", - id="two_backgrounds", - ), - pytest.param( - """ -.ui-widget-content { - background: #fff url("https://example.com/banner2.png") 50% 50% repeat-x; - color: #222; -} -""", - "https://www.acme.com/styles/main.css", - { - "https://example.com/banner2.png": Path( - "/content/css_assets/banner2.png" - ), - }, - """ -.ui-widget-content { - background: #fff url("css_assets/banner2.png") 50% 50% repeat-x; - color: #222; -} -""", - id="complex_1", - ), - pytest.param( - """ -@font-face { - font-display: swap; - font-family: icomoon; - font-style: normal; - font-weight: 400; - src: url(/@style/icons/icomoon.eot?_=ae123bc); - src: url(/@style/icons/icomoon.eot?_=ae123bc#iefix) - format("embedded-opentype"), - url(/@style/icons/icomoon.woff?_=ae123bc) - format("woff"), - url(/@style/icons/icomoon.ttf?_=ae123bc) - format("truetype"), - url(/@style/icons/icomoon.svg?_=ae123bc#icomoon) - format("svg"); -} -""", - "https://www.acme.com/styles/main.css", - { - "https://www.acme.com/@style/icons/icomoon.eot?_=ae123bc": Path( - "/content/css_assets/@style/icons/icomoon.eot" - ), - "https://www.acme.com/@style/icons/icomoon.eot?_=ae123bc#iefix": Path( - "/content/css_assets/@style/icons/icomoon_1.eot" - ), - "https://www.acme.com/@style/icons/icomoon.woff?_=ae123bc": Path( - "/content/css_assets/@style/icons/icomoon.woff" - ), - "https://www.acme.com/@style/icons/icomoon.ttf?_=ae123bc": Path( - "/content/css_assets/@style/icons/icomoon.ttf" - ), - "https://www.acme.com/@style/icons/icomoon.svg?_=ae123bc#icomoon": Path( - "/content/css_assets/@style/icons/icomoon.svg" - ), - }, - """ -@font-face { - font-display: swap; - font-family: icomoon; - font-style: normal; - font-weight: 400; - src: 
url(css_assets/@style/icons/icomoon.eot); - src: url(css_assets/@style/icons/icomoon_1.eot) - format("embedded-opentype"), - url(css_assets/@style/icons/icomoon.woff) - format("woff"), - url(css_assets/@style/icons/icomoon.ttf) - format("truetype"), - url(css_assets/@style/icons/icomoon.svg) - format("svg"); -} -""", - id="complex_2", - ), - pytest.param( - """ -body { - background-image: url('https://example.com/image.jpg'); -} -div { - background-image: url('https://example.com/image.jpg'); -} -""", - "https://www.acme.com/styles/main.css", - {"https://example.com/image.jpg": Path("/content/css_assets/image.jpg")}, - """ -body { - background-image: url("css_assets/image.jpg"); -} -div { - background-image: url("css_assets/image.jpg"); -} -""", - id="duplicate", - ), - pytest.param( - """ -.magicBg { -background-image: url() -} -""", - "https://www.acme.com/styles/main.css", - {}, - """ -.magicBg { -background-image: url() -} -""", - id="ignore_data", - ), - pytest.param( - """ -div { - background-image: url('https://example.com/image.jpg'); -} -}/*]]>*/ -""", - "https://www.acme.com/styles/main.css", - {"https://example.com/image.jpg": Path("/content/css_assets/image.jpg")}, - """ -div { - background-image: url("css_assets/image.jpg"); -} -""", - id="ignore_parsing_error", - ), - ], -) -def test_css_processor_single_doc( - css_document_content: str, - css_document_url: str, - expected_assets: dict[str, Path], - expected_css_rewritten: str, -): - processor = CssProcessor() - result = processor.process(css_document_url, css_document_content.encode()) - assert processor.css_assets == expected_assets - assert result == expected_css_rewritten - - -def test_css_processor_multiple_docs(): - doc1 = """ -body { - background-image: url('https://example.com/image.jpg'), url('https://example.com/image.jpg?_=test1'); -} -""" - doc2 = """ -div { - background-image: url('https://example.com/image.jpg'), url('https://example.com/image.jpg?_=test2'); -} -""" - css_1_url = 
"https://www.acme.com/styles/main1.css" - css_2_url = "https://www.acme.com/styles/main2.css" - processor = CssProcessor() - - # process a first document - result1 = processor.process(css_original_url=css_1_url, css_content=doc1.encode()) - - assert processor.css_assets == { - "https://example.com/image.jpg": Path("/content/css_assets/image.jpg"), - "https://example.com/image.jpg?_=test1": Path( - "/content/css_assets/image_1.jpg" - ), - } - - assert ( - result1 - == """ -body { - background-image: url("css_assets/image.jpg"), url("css_assets/image_1.jpg"); -} -""" - ) - - # process a second document - result2 = processor.process(css_original_url=css_2_url, css_content=doc2.encode()) - - assert processor.css_assets == { - "https://example.com/image.jpg": Path("/content/css_assets/image.jpg"), - "https://example.com/image.jpg?_=test1": Path( - "/content/css_assets/image_1.jpg" - ), - "https://example.com/image.jpg?_=test2": Path( - "/content/css_assets/image_2.jpg" - ), - } - - assert ( - result2 - == """ -div { - background-image: url("css_assets/image.jpg"), url("css_assets/image_2.jpg"); -} -""" - ) From 2bd3c61fc2256d5da02b8f9f470b3a1429a73eea Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 22 Oct 2024 17:37:11 +0200 Subject: [PATCH 2/2] Mutualize code between CSS stylesheets --- scraper/src/libretexts2zim/processor.py | 83 ++++++++++++++----------- 1 file changed, 48 insertions(+), 35 deletions(-) diff --git a/scraper/src/libretexts2zim/processor.py b/scraper/src/libretexts2zim/processor.py index 928bd01..4726cd2 100644 --- a/scraper/src/libretexts2zim/processor.py +++ b/scraper/src/libretexts2zim/processor.py @@ -312,45 +312,26 @@ def run(self) -> Path: add_item_for(creator, "content/logo.png", content=welcome_image.getvalue()) del welcome_image - items_to_download: dict[ZimPath, HttpUrl] = {} - screen_css = BytesIO() - stream_file(home.screen_css_url, byte_stream=screen_css) - url_rewriter = ArticleUrlRewriter( - article_url=HttpUrl(home.screen_css_url), - 
article_path=ZimPath("screen.css"), + self.items_to_download: dict[ZimPath, HttpUrl] = {} + self._process_css( + css_location=home.screen_css_url, + target_filename="screen.css", + creator=creator, ) - css_rewriter = CssRewriter(url_rewriter=url_rewriter, base_href=None) - result = css_rewriter.rewrite(content=screen_css.getvalue()) - items_to_download = {**items_to_download, **url_rewriter.items_to_download} - add_item_for(creator, "content/screen.css", content=result) - del screen_css - del css_rewriter - del url_rewriter - - print_css = BytesIO() - stream_file(home.print_css_url, byte_stream=print_css) - url_rewriter = ArticleUrlRewriter( - article_url=HttpUrl(home.print_css_url), - article_path=ZimPath("print.css"), + self._process_css( + css_location=home.print_css_url, + target_filename="print.css", + creator=creator, ) - css_rewriter = CssRewriter(url_rewriter=url_rewriter, base_href=None) - result = css_rewriter.rewrite(content=print_css.getvalue()) - items_to_download = {**items_to_download, **url_rewriter.items_to_download} - add_item_for(creator, "content/print.css", content=result) - del print_css - del css_rewriter - del url_rewriter - - url_rewriter = ArticleUrlRewriter( - article_url=HttpUrl(home.home_url), article_path=ZimPath("inline.css") + self._process_css( + css_location=home.home_url, + css_content="\n".join(home.inline_css), + target_filename="inline.css", + creator=creator, ) - css_rewriter = CssRewriter(url_rewriter=url_rewriter, base_href=None) - result = css_rewriter.rewrite(content=("\n".join(home.inline_css))) - items_to_download = {**items_to_download, **url_rewriter.items_to_download} - add_item_for(creator, "content/inline.css", content=result) - logger.info(f" Retrieving {len(items_to_download)} CSS assets...") - for asset_path, asset_url in items_to_download.items(): + logger.info(f" Retrieving {len(self.items_to_download)} CSS assets...") + for asset_path, asset_url in self.items_to_download.items(): try: css_asset = BytesIO() 
stream_file(asset_url.value, byte_stream=css_asset) @@ -402,3 +383,48 @@ def run(self) -> Path: ) return zim_path
+
+    def _process_css(
+        self,
+        creator: Creator,
+        target_filename: str,
+        css_location: str,
+        css_content: str | bytes | None = None,
+    ):
+        """Process a given CSS stylesheet
+
+        Download content if necessary, rewrite CSS and add CSS to ZIM
+
+        Arguments:
+            creator: ZIM creator to which the rewritten stylesheet is added
+            target_filename: ZIM path given to the URL rewriter for this stylesheet;
+                the rewritten CSS is stored at "content/<target_filename>"
+            css_location: URL given to the URL rewriter as the stylesheet URL; the
+                content is downloaded from it when css_content is None
+            css_content: stylesheet content when already known (e.g. inline CSS)
+
+        Raises:
+            ValueError: when css_location is empty
+        """
+        if not css_location:
+            raise ValueError(f"Cannot process empty css_location for {target_filename}")
+        # Download only when no content has been passed at all: an empty stylesheet
+        # ("" or b"") is valid content, and for inline CSS css_location is not the
+        # URL of a stylesheet, so it must not be fetched as a fallback.
+        if css_content is None:
+            css_buffer = BytesIO()
+            stream_file(css_location, byte_stream=css_buffer)
+            css_content = css_buffer.getvalue()
+        url_rewriter = ArticleUrlRewriter(
+            article_url=HttpUrl(css_location),
+            article_path=ZimPath(target_filename),
+        )
+        css_rewriter = CssRewriter(url_rewriter=url_rewriter, base_href=None)
+        result = css_rewriter.rewrite(content=css_content)
+        # We might have a "conflict" of ZimPath (two URLs leading to the same ZimPath)
+        # and we prefer to keep the first URL encountered. Both dict.update() and
+        # {**old, **new} would override the value with the last URL encountered, so
+        # only add the ZimPaths which are not known yet.
+        for zim_path, url in url_rewriter.items_to_download.items():
+            self.items_to_download.setdefault(zim_path, url)
+        add_item_for(creator, f"content/{target_filename}", content=result)