Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move to content rewriting code from zimscraperlib #33

Merged
merged 2 commits into from
Oct 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion scraper/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ readme = "../README.md"
dependencies = [
"yt-dlp", # youtube-dl should be updated as frequently as possible
"jinja2==3.1.4",
"zimscraperlib==4.0.0",
# use zimscraperlib pinned version once content rewriting functions have been released
"zimscraperlib @ git+https://github.com/openzim/python-scraperlib@main",
"requests==2.32.3",
"types-requests==2.32.0.20240914",
"kiwixstorage==0.9.0",
Expand Down
143 changes: 0 additions & 143 deletions scraper/src/libretexts2zim/css.py

This file was deleted.

88 changes: 62 additions & 26 deletions scraper/src/libretexts2zim/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,12 @@
stream_file, # pyright: ignore[reportUnknownVariableType]
)
from zimscraperlib.image import resize_image
from zimscraperlib.rewriting.css import CssRewriter
from zimscraperlib.rewriting.url_rewriting import (
ArticleUrlRewriter,
HttpUrl,
ZimPath,
)
from zimscraperlib.zim import Creator
from zimscraperlib.zim.filesystem import validate_zimfile_creatable
from zimscraperlib.zim.indexing import IndexData
Expand All @@ -22,7 +28,6 @@
LibreTextsMetadata,
)
from libretexts2zim.constants import LANGUAGE_ISO_639_3, NAME, ROOT_DIR, VERSION, logger
from libretexts2zim.css import CssProcessor
from libretexts2zim.ui import (
ConfigModel,
PageContentModel,
Expand Down Expand Up @@ -307,44 +312,43 @@
add_item_for(creator, "content/logo.png", content=welcome_image.getvalue())
del welcome_image

css_processor = CssProcessor()
screen_css = BytesIO()
stream_file(home.screen_css_url, byte_stream=screen_css)
result = css_processor.process(
css_original_url=home.screen_css_url, css_content=screen_css.getvalue()
self.items_to_download: dict[ZimPath, HttpUrl] = {}
self._process_css(

Check warning on line 316 in scraper/src/libretexts2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/processor.py#L315-L316

Added lines #L315 - L316 were not covered by tests
css_location=home.screen_css_url,
target_filename="screen.css",
creator=creator,
)
add_item_for(creator, "content/screen.css", content=result)
del screen_css

print_css = BytesIO()
stream_file(home.print_css_url, byte_stream=print_css)
result = css_processor.process(
css_original_url=home.print_css_url, css_content=print_css.getvalue()
self._process_css(

Check warning on line 321 in scraper/src/libretexts2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/processor.py#L321

Added line #L321 was not covered by tests
css_location=home.print_css_url,
target_filename="print.css",
creator=creator,
)
add_item_for(creator, "content/print.css", content=result)
del print_css

result = css_processor.process(
css_original_url=home.home_url,
css_content=("\n".join(home.inline_css)).encode(),
self._process_css(

Check warning on line 326 in scraper/src/libretexts2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/processor.py#L326

Added line #L326 was not covered by tests
css_location=home.home_url,
css_content="\n".join(home.inline_css),
target_filename="inline.css",
creator=creator,
)
add_item_for(creator, "content/inline.css", content=result)

logger.info(f" Retrieving {len(css_processor.css_assets)} CSS assets...")
for asset_url, asset_path in css_processor.css_assets.items():
logger.info(f" Retrieving {len(self.items_to_download)} CSS assets...")

Check warning on line 333 in scraper/src/libretexts2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/processor.py#L333

Added line #L333 was not covered by tests
for asset_path, asset_url in self.items_to_download.items():
try:
css_asset = BytesIO()
stream_file(asset_url, byte_stream=css_asset)
stream_file(asset_url.value, byte_stream=css_asset)
logger.debug(

Check warning on line 338 in scraper/src/libretexts2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/processor.py#L337-L338

Added lines #L337 - L338 were not covered by tests
f"Adding {asset_url.value} to {asset_path.value} in the ZIM"
)
add_item_for(
creator, str(asset_path)[1:], content=css_asset.getvalue()
creator,
"content/" + asset_path.value,
content=css_asset.getvalue(),
)
logger.debug(f"Adding {asset_url} to {asset_path} in the ZIM")
del css_asset
except HTTPError as exc:
# would make more sense to be a warning, but this is just too
# verbose, at least on geo.libretexts.org many assets are just
# missing
logger.debug(f"Ignoring {asset_path} due to {exc}")
logger.debug(f"Ignoring {asset_path.value} due to {exc}")

Check warning on line 351 in scraper/src/libretexts2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/processor.py#L351

Added line #L351 was not covered by tests

logger.info("Fetching pages tree")
pages_tree = self.libretexts_client.get_page_tree()
Expand Down Expand Up @@ -379,3 +383,35 @@
)

return zim_path

def _process_css(
self,
creator: Creator,
target_filename: str,
css_location: str,
css_content: str | bytes | None = None,
):
"""Process a given CSS stylesheet
Download content if necessary, rewrite CSS and add CSS to ZIM
"""
if not css_location:
raise ValueError(f"Cannot process empty css_location for {target_filename}")

Check warning on line 398 in scraper/src/libretexts2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/processor.py#L398

Added line #L398 was not covered by tests
if not css_content:
css_buffer = BytesIO()
stream_file(css_location, byte_stream=css_buffer)
css_content = css_buffer.getvalue()
url_rewriter = ArticleUrlRewriter(

Check warning on line 403 in scraper/src/libretexts2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/processor.py#L400-L403

Added lines #L400 - L403 were not covered by tests
article_url=HttpUrl(css_location),
article_path=ZimPath(target_filename),
)
css_rewriter = CssRewriter(url_rewriter=url_rewriter, base_href=None)
result = css_rewriter.rewrite(content=css_content)

Check warning on line 408 in scraper/src/libretexts2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/processor.py#L407-L408

Added lines #L407 - L408 were not covered by tests
# Rebuild the dict since we might have "conflict" of ZimPath (two urls leading
# to the same ZimPath) and we prefer to use the first URL encountered, where
# using self.items_to_download.update while override the key value, prefering
# to use last URL encountered.
self.items_to_download = {

Check warning on line 413 in scraper/src/libretexts2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/processor.py#L413

Added line #L413 was not covered by tests
benoit74 marked this conversation as resolved.
Show resolved Hide resolved
**self.items_to_download,
**url_rewriter.items_to_download,
}
add_item_for(creator, f"content/{target_filename}", content=result)

Check warning on line 417 in scraper/src/libretexts2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/processor.py#L417

Added line #L417 was not covered by tests
Loading