Skip to content

Commit

Permalink
Merge pull request #33 from openzim/add_content_rewriting
Browse files Browse the repository at this point in the history
Move to content rewriting code from zimscraperlib
  • Loading branch information
benoit74 authored Oct 24, 2024
2 parents 797edd3 + 2bd3c61 commit 018982a
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 480 deletions.
3 changes: 2 additions & 1 deletion scraper/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ readme = "../README.md"
dependencies = [
"yt-dlp", # youtube-dl should be updated as frequently as possible
"jinja2==3.1.4",
"zimscraperlib==4.0.0",
# use zimscraperlib pinned version once content rewriting functions have been released
"zimscraperlib @ git+https://github.com/openzim/python-scraperlib@main",
"requests==2.32.3",
"types-requests==2.32.0.20240914",
"kiwixstorage==0.9.0",
Expand Down
143 changes: 0 additions & 143 deletions scraper/src/libretexts2zim/css.py

This file was deleted.

88 changes: 62 additions & 26 deletions scraper/src/libretexts2zim/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,12 @@
stream_file, # pyright: ignore[reportUnknownVariableType]
)
from zimscraperlib.image import resize_image
from zimscraperlib.rewriting.css import CssRewriter
from zimscraperlib.rewriting.url_rewriting import (
ArticleUrlRewriter,
HttpUrl,
ZimPath,
)
from zimscraperlib.zim import Creator
from zimscraperlib.zim.filesystem import validate_zimfile_creatable
from zimscraperlib.zim.indexing import IndexData
Expand All @@ -22,7 +28,6 @@
LibreTextsMetadata,
)
from libretexts2zim.constants import LANGUAGE_ISO_639_3, NAME, ROOT_DIR, VERSION, logger
from libretexts2zim.css import CssProcessor
from libretexts2zim.ui import (
ConfigModel,
PageContentModel,
Expand Down Expand Up @@ -307,44 +312,43 @@ def run(self) -> Path:
add_item_for(creator, "content/logo.png", content=welcome_image.getvalue())
del welcome_image

css_processor = CssProcessor()
screen_css = BytesIO()
stream_file(home.screen_css_url, byte_stream=screen_css)
result = css_processor.process(
css_original_url=home.screen_css_url, css_content=screen_css.getvalue()
self.items_to_download: dict[ZimPath, HttpUrl] = {}
self._process_css(
css_location=home.screen_css_url,
target_filename="screen.css",
creator=creator,
)
add_item_for(creator, "content/screen.css", content=result)
del screen_css

print_css = BytesIO()
stream_file(home.print_css_url, byte_stream=print_css)
result = css_processor.process(
css_original_url=home.print_css_url, css_content=print_css.getvalue()
self._process_css(
css_location=home.print_css_url,
target_filename="print.css",
creator=creator,
)
add_item_for(creator, "content/print.css", content=result)
del print_css

result = css_processor.process(
css_original_url=home.home_url,
css_content=("\n".join(home.inline_css)).encode(),
self._process_css(
css_location=home.home_url,
css_content="\n".join(home.inline_css),
target_filename="inline.css",
creator=creator,
)
add_item_for(creator, "content/inline.css", content=result)

logger.info(f" Retrieving {len(css_processor.css_assets)} CSS assets...")
for asset_url, asset_path in css_processor.css_assets.items():
logger.info(f" Retrieving {len(self.items_to_download)} CSS assets...")
for asset_path, asset_url in self.items_to_download.items():
try:
css_asset = BytesIO()
stream_file(asset_url, byte_stream=css_asset)
stream_file(asset_url.value, byte_stream=css_asset)
logger.debug(
f"Adding {asset_url.value} to {asset_path.value} in the ZIM"
)
add_item_for(
creator, str(asset_path)[1:], content=css_asset.getvalue()
creator,
"content/" + asset_path.value,
content=css_asset.getvalue(),
)
logger.debug(f"Adding {asset_url} to {asset_path} in the ZIM")
del css_asset
except HTTPError as exc:
# would make more sense to be a warning, but this is just too
# verbose, at least on geo.libretexts.org many assets are just
# missing
logger.debug(f"Ignoring {asset_path} due to {exc}")
logger.debug(f"Ignoring {asset_path.value} due to {exc}")

logger.info("Fetching pages tree")
pages_tree = self.libretexts_client.get_page_tree()
Expand Down Expand Up @@ -379,3 +383,35 @@ def run(self) -> Path:
)

return zim_path

def _process_css(
    self,
    creator: Creator,
    target_filename: str,
    css_location: str,
    css_content: str | bytes | None = None,
):
    """Process a given CSS stylesheet

    Download content if necessary, rewrite CSS and add CSS to ZIM.

    Args:
        creator: ZIM creator the rewritten stylesheet is added to.
        target_filename: name of the stylesheet inside the ZIM; the item is
            stored at ``content/<target_filename>``.
        css_location: URL the stylesheet comes from. It is fetched when
            ``css_content`` is None and is always passed to the URL rewriter
            as the article URL.
        css_content: stylesheet content when already known (e.g. inline CSS
            extracted from a page). None means "download css_location".

    Raises:
        ValueError: if ``css_location`` is empty.
    """
    if not css_location:
        raise ValueError(f"Cannot process empty css_location for {target_filename}")
    # Download only when no content was supplied at all. An empty string (e.g.
    # a page without any inline CSS) is legitimate content and must not trigger
    # a download of css_location, which for inline CSS is the HTML page itself.
    if css_content is None:
        css_buffer = BytesIO()
        stream_file(css_location, byte_stream=css_buffer)
        css_content = css_buffer.getvalue()
    url_rewriter = ArticleUrlRewriter(
        article_url=HttpUrl(css_location),
        article_path=ZimPath(target_filename),
    )
    css_rewriter = CssRewriter(url_rewriter=url_rewriter, base_href=None)
    result = css_rewriter.rewrite(content=css_content)
    # We might have a "conflict" of ZimPath (two URLs leading to the same
    # ZimPath) and we prefer to use the first URL encountered. Using
    # self.items_to_download.update() or {**old, **new} would override the key
    # value, preferring the last URL encountered, so only add missing paths.
    for zim_path, url in url_rewriter.items_to_download.items():
        self.items_to_download.setdefault(zim_path, url)
    add_item_for(creator, f"content/{target_filename}", content=result)
Loading

0 comments on commit 018982a

Please sign in to comment.