Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rewrite HTML: <a> and <img> tags #39

Merged
merged 5 commits into from
Oct 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion scraper/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ dependencies = [
"yt-dlp", # youtube-dl should be updated as frequently as possible
"jinja2==3.1.4",
# use zimscraperlib pinned version once content rewriting functions have been released
"zimscraperlib @ git+https://github.com/openzim/python-scraperlib@main",
"zimscraperlib @ git+https://github.com/openzim/python-scraperlib@mindtouch_changes",
"requests==2.32.3",
"types-requests==2.32.0.20240914",
"kiwixstorage==0.9.0",
Expand Down
9 changes: 6 additions & 3 deletions scraper/src/mindtouch2zim/entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,8 +152,9 @@
# Client configuration flags
parser.add_argument(
"--library-url",
help="URL of the Mindtouch / Nice CXone Expert instance, e.g. for LibreTexts "
"Geosciences it is https://geo.libretexts.org/",
help="URL of the Mindtouch / Nice CXone Expert instance (must NOT contain "
"trailing slash), e.g. for LibreTexts Geosciences it is "
"https://geo.libretexts.org",
required=True,
)

Expand Down Expand Up @@ -217,6 +218,8 @@
tmp_folder.mkdir(exist_ok=True)
validate_zimfile_creatable(tmp_folder, "test.txt")

library_url = str(args.library_url).rstrip("/")

Check warning on line 221 in scraper/src/mindtouch2zim/entrypoint.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/entrypoint.py#L221

Added line #L221 was not covered by tests

try:
zim_config = ZimConfig.of(args)
doc_filter = ContentFilter.of(args)
Expand All @@ -225,7 +228,7 @@
cache_folder.mkdir(exist_ok=True)

mindtouch_client = MindtouchClient(
library_url=args.library_url,
library_url=library_url,
cache_folder=cache_folder,
)

Expand Down
246 changes: 211 additions & 35 deletions scraper/src/mindtouch2zim/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,12 @@
)
from zimscraperlib.image import resize_image
from zimscraperlib.rewriting.css import CssRewriter
from zimscraperlib.rewriting.html import HtmlRewriter
from zimscraperlib.rewriting.html import rules as html_rules
from zimscraperlib.rewriting.url_rewriting import (
ArticleUrlRewriter,
HttpUrl,
RewriteResult,
ZimPath,
)
from zimscraperlib.zim import Creator
Expand Down Expand Up @@ -48,6 +51,12 @@
pass


class UnsupportedTagError(Exception):
    """An exception raised when an HTML tag is not expected to be encountered

    The docstring is a sufficient class body, so no explicit ``pass`` is needed.
    """


class ContentFilter(BaseModel):
"""Supports filtering documents by user provided attributes."""

Expand Down Expand Up @@ -313,7 +322,7 @@
add_item_for(creator, "content/logo.png", content=welcome_image.getvalue())
del welcome_image

self.items_to_download: dict[ZimPath, HttpUrl] = {}
self.items_to_download: dict[ZimPath, set[HttpUrl]] = {}

Check warning on line 325 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L325

Added line #L325 was not covered by tests
self._process_css(
css_location=home.screen_css_url,
target_filename="screen.css",
Expand All @@ -331,26 +340,6 @@
creator=creator,
)

logger.info(f" Retrieving {len(self.items_to_download)} CSS assets...")
for asset_path, asset_url in self.items_to_download.items():
try:
css_asset = BytesIO()
stream_file(asset_url.value, byte_stream=css_asset)
logger.debug(
f"Adding {asset_url.value} to {asset_path.value} in the ZIM"
)
add_item_for(
creator,
"content/" + asset_path.value,
content=css_asset.getvalue(),
)
del css_asset
except HTTPError as exc:
# would make more sense to be a warning, but this is just too
# verbose, at least on geo.libretexts.org many assets are just
# missing
logger.debug(f"Ignoring {asset_path.value} due to {exc}")

logger.info("Fetching pages tree")
pages_tree = self.mindtouch_client.get_page_tree()
selected_pages = self.content_filter.filter(pages_tree)
Expand All @@ -372,17 +361,40 @@
)

logger.info("Fetching pages content")
# compute the list of existing pages to properly rewrite links leading
# in-ZIM / out-of-ZIM
existing_html_pages = {

Check warning on line 366 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L366

Added line #L366 was not covered by tests
ArticleUrlRewriter.normalize(
HttpUrl(f"{self.mindtouch_client.library_url}/{page.path}")
)
for page in selected_pages
}
for page in selected_pages:
logger.debug(f" Fetching {page.id}")
page_content = self.mindtouch_client.get_page_content(page)
add_item_for(
creator,
f"content/page_content_{page.id}.json",
content=PageContentModel(
html_body=page_content.html_body
).model_dump_json(by_alias=True),
self._process_page(

Check warning on line 373 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L373

Added line #L373 was not covered by tests
creator=creator, page=page, existing_zim_paths=existing_html_pages
)

logger.info(f" Retrieving {len(self.items_to_download)} assets...")

Check warning on line 377 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L377

Added line #L377 was not covered by tests
for asset_path, asset_urls in self.items_to_download.items():
for asset_url in asset_urls:
try:
asset_content = BytesIO()
stream_file(asset_url.value, byte_stream=asset_content)
logger.debug(

Check warning on line 383 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L380-L383

Added lines #L380 - L383 were not covered by tests
f"Adding {asset_url.value} to {asset_path.value} in the ZIM"
)
add_item_for(

Check warning on line 386 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L386

Added line #L386 was not covered by tests
creator,
"content/" + asset_path.value,
content=asset_content.getvalue(),
)
break # file found and added
except HTTPError as exc:

Check warning on line 392 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L391-L392

Added lines #L391 - L392 were not covered by tests
# would make more sense to be a warning, but this is just too
# verbose, at least on geo.libretexts.org many assets are just
# missing
logger.debug(f"Ignoring {asset_path.value} due to {exc}")

Check warning on line 396 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L396

Added line #L396 was not covered by tests

return zim_path

def _process_css(
Expand All @@ -401,18 +413,182 @@
css_buffer = BytesIO()
stream_file(css_location, byte_stream=css_buffer)
css_content = css_buffer.getvalue()
url_rewriter = ArticleUrlRewriter(
url_rewriter = CssUrlsRewriter(

Check warning on line 416 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L416

Added line #L416 was not covered by tests
article_url=HttpUrl(css_location),
article_path=ZimPath(target_filename),
)
css_rewriter = CssRewriter(url_rewriter=url_rewriter, base_href=None)
css_rewriter = CssRewriter(

Check warning on line 420 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L420

Added line #L420 was not covered by tests
url_rewriter=url_rewriter, base_href=None, remove_errors=True
)
result = css_rewriter.rewrite(content=css_content)
# Rebuild the dict since we might have a "conflict" of ZimPath (two URLs leading
# to the same ZimPath) and we prefer to use the first URL encountered, whereas
# using self.items_to_download.update would override the key value, preferring
# the last URL encountered.
self.items_to_download = {
**self.items_to_download,
**url_rewriter.items_to_download,
}
for path, urls in url_rewriter.items_to_download.items():
if path in self.items_to_download:
self.items_to_download[path].update(urls)

Check warning on line 430 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L430

Added line #L430 was not covered by tests
else:
self.items_to_download[path] = urls

Check warning on line 432 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L432

Added line #L432 was not covered by tests
add_item_for(creator, f"content/{target_filename}", content=result)

def _process_page(
    self, creator: Creator, page: LibraryPage, existing_zim_paths: set[ZimPath]
):
    """Fetch one library page, rewrite its HTML and store it as JSON in the ZIM

    The page body is retrieved through the Mindtouch client and passed through an
    HtmlRewriter driven by an HtmlUrlsRewriter. Every asset the rewriting rules
    recorded is merged into ``self.items_to_download``, and the rewritten HTML is
    added to the ZIM at ``content/page_content_<page.id>.json``.
    """
    logger.debug(f" Fetching {page.id}")
    fetched = self.mindtouch_client.get_page_content(page)
    url_rewriter = HtmlUrlsRewriter(
        self.mindtouch_client.library_url,
        page,
        existing_zim_paths=existing_zim_paths,
    )
    rewritten = HtmlRewriter(
        url_rewriter=url_rewriter,
        pre_head_insert=None,
        post_head_insert=None,
        notify_js_module=None,
    ).rewrite(fetched.html_body)
    # merge assets found on this page with those already known, keeping every
    # candidate URL seen for a given ZIM path
    for zim_path, source_urls in url_rewriter.items_to_download.items():
        known_urls = self.items_to_download.get(zim_path)
        if known_urls is None:
            self.items_to_download[zim_path] = source_urls
        else:
            known_urls.update(source_urls)
    page_json = PageContentModel(html_body=rewritten.content).model_dump_json(
        by_alias=True
    )
    add_item_for(
        creator,
        f"content/page_content_{page.id}.json",
        content=page_json,
    )


# Remove all standard rules coming with zimscraperlib's `html_rules`: they are not
# adapted to the Vue.JS UI.
# NOTE(review): presumably this has to run before the decorated functions below so
# that only this module's rules end up registered - confirm against zimscraperlib.
html_rules.rewrite_attribute_rules.clear()
html_rules.rewrite_data_rules.clear()
html_rules.rewrite_tag_rules.clear()


@html_rules.rewrite_attribute()
def rewrite_href_src_attributes(
    tag: str,
    attr_name: str,
    attr_value: str | None,
    url_rewriter: ArticleUrlRewriter,
    base_href: str | None,
):
    """Rewrite href and src attributes of <a> and <img> tags

    - <a>: links whose rewritten URL starts with the library path are turned into
      ``#/<path>`` links for navigation inside the ZIM Vue.JS UI; other links keep
      the rewritten URL returned by the URL rewriter.
    - <img>: the rewritten URL is prefixed with ``content/`` and, when the rewriter
      reports a ZIM path, the absolute URL is recorded in
      ``url_rewriter.items_to_download`` so that it can be downloaded afterwards.

    Returns None (attribute left untouched) when the attribute is neither ``href``
    nor ``src`` or has no value, otherwise the ``(attr_name, new_value)`` tuple.

    Raises:
        TypeError: if ``url_rewriter`` is not an HtmlUrlsRewriter
        ValueError: if no new value could be computed, i.e. the tag is not (yet)
            supported or the rewritten URL is empty
    """
    if attr_name not in ("href", "src") or not attr_value:
        return
    if not isinstance(url_rewriter, HtmlUrlsRewriter):
        # TypeError is still an Exception, so existing handlers keep working
        raise TypeError("Expecting HtmlUrlsRewriter")
    new_attr_value = None
    if tag == "a":
        rewrite_result = url_rewriter(
            attr_value, base_href=base_href, rewrite_all_url=False
        )
        # rewrite links for proper navigation inside ZIM Vue.JS UI (if inside ZIM) or
        # full link (if outside the current library)
        library_prefix = url_rewriter.library_path.value
        rewritten_url = rewrite_result.rewriten_url
        new_attr_value = (
            f"#/{rewritten_url[len(library_prefix) :]}"
            if rewritten_url.startswith(library_prefix)
            else rewritten_url
        )
    elif tag == "img":
        rewrite_result = url_rewriter(
            attr_value, base_href=base_href, rewrite_all_url=True
        )
        # add 'content/' to the URL since all assets will be stored in the sub.-path
        new_attr_value = f"content/{rewrite_result.rewriten_url}"
        if rewrite_result.zim_path is not None:
            # if item is expected to be inside the ZIM, store asset information so
            # that we can download it afterwards
            asset_url = HttpUrl(rewrite_result.absolute_url)
            url_rewriter.items_to_download.setdefault(
                rewrite_result.zim_path, set()
            ).add(asset_url)
    if not new_attr_value:
        # we do not (yet) support other tags / attributes so we fail the scraper
        raise ValueError(
            f"Empty new value when rewriting {attr_value} from {attr_name} in {tag} tag"
        )
    return (attr_name, new_attr_value)

Check warning on line 522 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L522

Added line #L522 was not covered by tests


@html_rules.drop_attribute()
def drop_sizes_and_srcset_attribute(tag: str, attr_name: str):
    """Report whether an attribute is a srcset / sizes attribute of an <img> tag"""
    if tag != "img":
        return False
    return attr_name in ("srcset", "sizes")

Check warning on line 528 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L528

Added line #L528 was not covered by tests


@html_rules.rewrite_tag()
def refuse_unsupported_tags(tag: str):
    """Fail with UnsupportedTagError when an unsupported tag is encountered"""
    if tag in ["picture"]:
        raise UnsupportedTagError(f"Tag {tag} is not yet supported in this scraper")
    return

Check warning on line 536 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L535-L536

Added lines #L535 - L536 were not covered by tests


class HtmlUrlsRewriter(ArticleUrlRewriter):
    """URL rewriter used while processing the HTML of a library page

    Calling this rewriter does not record anything to download by itself; it only
    carries the containers and metadata (``library_url``, ``library_path``,
    ``items_to_download``) that HTML rewriting rules use to decide what needs to be
    downloaded.
    """

    def __init__(
        self, library_url: str, page: LibraryPage, existing_zim_paths: set[ZimPath]
    ):
        page_url = HttpUrl(f"{library_url}/{page.path}")
        super().__init__(
            article_url=page_url,
            article_path=ZimPath("index.html"),
            existing_zim_paths=existing_zim_paths,
        )
        self.library_url = library_url
        # normalized path of the library root, compared against rewritten URLs
        library_root = HttpUrl(f"{library_url}/")
        self.library_path = ArticleUrlRewriter.normalize(library_root)
        # filled by HTML rewriting rules: ZIM path -> candidate source URLs
        self.items_to_download: dict[ZimPath, set[HttpUrl]] = {}

    def __call__(
        self, item_url: str, base_href: str | None, *, rewrite_all_url: bool = True
    ) -> RewriteResult:
        # plain delegation to the parent rewriter, nothing is stored here
        return super().__call__(item_url, base_href, rewrite_all_url=rewrite_all_url)

Check warning on line 562 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L561-L562

Added lines #L561 - L562 were not covered by tests


class CssUrlsRewriter(ArticleUrlRewriter):
    """URL rewriter for CSS documents which records items to download on-the-fly

    Every URL rewritten to a ZIM path is stored in ``items_to_download``, mapping
    the ZIM path to the set of absolute URLs which have been seen for it.
    """

    def __init__(
        self,
        *,
        article_url: HttpUrl,
        article_path: ZimPath,
    ):
        super().__init__(article_url=article_url, article_path=article_path)
        self.items_to_download: dict[ZimPath, set[HttpUrl]] = {}

    def __call__(
        self,
        item_url: str,
        base_href: str | None,
        *,
        rewrite_all_url: bool = True,  # noqa: ARG002
    ) -> RewriteResult:
        # the caller-provided flag is ignored: all URLs are always rewritten
        rewritten = super().__call__(item_url, base_href, rewrite_all_url=True)
        if rewritten.zim_path is not None:
            source_url = HttpUrl(rewritten.absolute_url)
            self.items_to_download.setdefault(rewritten.zim_path, set()).add(
                source_url
            )
        return rewritten

Check warning on line 594 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L593-L594

Added lines #L593 - L594 were not covered by tests