Apply proper CSS for proper page display - step 1
This first step takes care of CSS stylesheets which live in external files (two of them, in fact: one for screen and one for print).

It does not handle inline CSS, which is also needed and will be addressed in step 2.
benoit74 committed Oct 8, 2024
1 parent 733c35a commit b6d1d52
Showing 8 changed files with 523 additions and 7 deletions.
4 changes: 2 additions & 2 deletions CONTRIBUTING.md
@@ -24,10 +24,10 @@ To achieve this, first build the Docker image based on current code base.
docker build -t local-libretexts2zim .
```

Scrape a library (here we use the [Geosciences](https://geo.libretexts.org) library, but you could use any other one of interest for your UI developments).
Scrape a library (here we use the [Geosciences](https://geo.libretexts.org) library, with only page id 28207 and its children, but you could use any other one of interest for your UI developments).

```
docker run --rm -it -v "$PWD/output":/output local-libretexts2zim libretexts2zim --library-slug geo --library-name Geosciences --file-name-format "tests_en_libretexts-geo" --overwrite
docker run --rm -it -v "$PWD/output":/output local-libretexts2zim libretexts2zim --library-slug geo --library-name Geosciences --file-name-format "tests_en_libretexts-geo" --root-page-id 28207 --overwrite
```

Extract interesting ZIM content and move it to `public` folder.
1 change: 1 addition & 0 deletions scraper/pyproject.toml
@@ -20,6 +20,7 @@ dependencies = [
"beautifulsoup4==4.12.3",
"types-beautifulsoup4==4.12.0.20240907",
"lxml==5.3.0",
"tinycss2==1.3.0",
]
dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]

29 changes: 28 additions & 1 deletion scraper/src/libretexts2zim/client.py
@@ -22,6 +22,8 @@ class LibreTextsParsingError(Exception):
class LibreTextsHome(BaseModel):
welcome_text_paragraphs: list[str]
welcome_image_url: str
screen_css_url: str
print_css_url: str


LibraryPageId = str
@@ -206,6 +208,8 @@ def get_home(self) -> LibreTextsHome:
return LibreTextsHome(
welcome_text_paragraphs=_get_welcome_text_from_home(soup),
welcome_image_url=_get_welcome_image_url_from_home(soup),
screen_css_url=_get_screen_css_url_from_home(soup),
print_css_url=_get_print_css_url_from_home(soup),
)

def get_deki_token(self) -> str:
@@ -308,7 +312,7 @@ def get_page_content(self, page: LibraryPage) -> LibraryPageContent:
if tree["body"][1]["@target"] != "toc":
raise LibreTextsParsingError(
f"Unexpected second body element of /pages/{page.id}/contents, "
f"@target property is '{tree["body"][1]["@target"]}' while only 'toc' "
f"@target property is '{tree['body'][1]['@target']}' while only 'toc' "
"is expected"
)
return LibraryPageContent(html_body=tree["body"][0])
@@ -373,3 +377,26 @@ def _get_deki_token_from_home(soup: BeautifulSoup) -> str:
"Failed to retrieve API token to query website API, missing apiToken."
)
return x_deki_token


def _get_any_css_url_from_home(soup: BeautifulSoup, media: str) -> str:
"""Returns the URL of any media CSS found on home page"""
links = soup.find_all("link", {"rel": "stylesheet", "media": media})
if len(links) != 1:
raise LibreTextsParsingError(
f"Failed to find {media} CSS URL in home page, {len(links)} link(s) found"
)
css_url = links[0].get("href", None)
if not css_url:
raise LibreTextsParsingError("screen CSS link has no href")
return css_url


def _get_screen_css_url_from_home(soup: BeautifulSoup) -> str:
"""Returns the URL of screen CSS found on home page"""
return _get_any_css_url_from_home(soup, "screen")


def _get_print_css_url_from_home(soup: BeautifulSoup) -> str:
"""Returns the URL of print CSS found on home page"""
return _get_any_css_url_from_home(soup, "print")
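
For illustration, here is a minimal sketch (not part of this diff) of how `_get_screen_css_url_from_home` and `_get_print_css_url_from_home` are expected to resolve the stylesheet links: the home page is assumed to expose exactly one `<link rel="stylesheet">` per media type. The HTML snippet below is made up for the example.

```
from bs4 import BeautifulSoup

# Hypothetical home-page excerpt with one stylesheet per media type,
# mirroring what _get_any_css_url_from_home expects to find.
html = """
<html><head>
  <link rel="stylesheet" media="screen" href="https://a.mtstatic.com/@cache/layout/anonymous.css">
  <link rel="stylesheet" media="print" href="https://a.mtstatic.com/@cache/layout/print.css">
</head><body></body></html>
"""

soup = BeautifulSoup(html, "lxml")

# Same lookup as _get_any_css_url_from_home: exactly one <link> per media type,
# otherwise the scraper raises a LibreTextsParsingError.
for media in ("screen", "print"):
    links = soup.find_all("link", {"rel": "stylesheet", "media": media})
    assert len(links) == 1, f"expected exactly one {media} stylesheet"
    print(media, "->", links[0]["href"])
```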

140 changes: 140 additions & 0 deletions scraper/src/libretexts2zim/css.py
@@ -0,0 +1,140 @@
from collections.abc import Iterable
from pathlib import Path
from urllib.parse import urljoin, urlparse

from tinycss2 import ast, parse_stylesheet_bytes, serialize # pyright: ignore
from tinycss2.serializer import serialize_url # pyright: ignore

OriginalUrl = str
FullZimPath = Path
RelativeCssPath = Path


class CssProcessor:
"""Utility to to process CSS, extract assets and rewrite URLs
This utility can process multiple CSS documents that will be stored in a ZIM
It extracts the list of assets (images, fonts) that are used in the CSS documents
and compute appropriate ZIM paths for each of them.
Arguments:
css_target_path: "folder" where the CSS documents that will be processed will be
stored in the ZIM
css_assets_root_path: "folder" where the CSS assets referenced in the CSS
documents will be stored in the ZIM
"""

def __init__(
self,
css_target_path: Path = Path("/content"),
css_assets_root_path: Path = Path("/content/css_assets"),
) -> None:
self.css_target_path = css_target_path
self.css_assets_root_path = css_assets_root_path
self.css_assets: dict[OriginalUrl, FullZimPath] = {}
self.used_paths: list[RelativeCssPath] = []

def process(self, css_original_url: str, css_content: bytes) -> str:
rules, _ = parse_stylesheet_bytes( # pyright: ignore[reportUnknownVariableType]
css_content
)
self._process_list(
css_original_url,
rules, # pyright: ignore[reportUnknownArgumentType]
)
return serialize(rules)

def _process_url(
self, css_original_url: str, css_url: str
) -> RelativeCssPath | None:
original_url = urljoin(css_original_url, css_url)
original_url_parsed = urlparse(original_url)
if original_url_parsed.scheme.lower() not in ["http", "https"]:
return None
if original_url in self.css_assets:
return self.css_assets[original_url].relative_to(self.css_target_path)
original_path = Path(urlparse(original_url).path)
target_parent = Path(
*[
parent.name
for parent in reversed(original_path.parents)
if parent.name and parent.name != ".."
]
)

index = 0
while True:
relative_path = (
target_parent
/ f"{original_path.stem}{'_' + str(index) if index else ''}"
f"{original_path.suffix}"
)
if relative_path not in self.used_paths:
break
index += 1

self.used_paths.append(relative_path)
target_path = self.css_assets_root_path / relative_path
self.css_assets[original_url] = target_path
return target_path.relative_to(self.css_target_path)

def _process_node(self, css_original_url: str, node: ast.Node):
if isinstance(
node,
ast.QualifiedRule
| ast.SquareBracketsBlock
| ast.ParenthesesBlock
| ast.CurlyBracketsBlock,
):
self._process_list(
css_original_url,
node.content, # pyright: ignore[reportUnknownArgumentType, reportUnknownMemberType]
)
elif isinstance(node, ast.FunctionBlock):
if node.lower_name == "url": # pyright: ignore[reportUnknownMemberType]
url_node: ast.Node = node.arguments[0] # pyright: ignore
relative_css_path = self._process_url(
css_original_url,
url_node.value, # pyright: ignore
)
if not relative_css_path:
return
url_node.value = str(relative_css_path) # pyright: ignore
url_node.representation = ( # pyright: ignore
f'"{serialize_url(str(relative_css_path))}"'
)

else:
self._process_list(
css_original_url,
node.arguments, # pyright: ignore
)
elif isinstance(node, ast.AtRule):
self._process_list(
css_original_url,
node.prelude, # pyright: ignore
)
self._process_list(
css_original_url,
node.content, # pyright: ignore
)
elif isinstance(node, ast.Declaration):
self._process_list(
css_original_url,
node.value, # pyright: ignore
)
elif isinstance(node, ast.URLToken):
relative_css_path = self._process_url(
css_original_url,
node.value, # pyright: ignore
)
if not relative_css_path:
return
node.value = str(relative_css_path)
node.representation = f"url({serialize_url(str(relative_css_path))})"

def _process_list(self, css_original_url: str, nodes: Iterable[ast.Node] | None):
if not nodes:
return
for node in nodes:
self._process_node(css_original_url, node)
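
As a hypothetical usage sketch (not part of this diff), the snippet below runs a tiny stylesheet through `CssProcessor.process()`: `url()` references are rewritten to paths relative to the CSS target path, while the original asset URLs are collected in `css_assets` so they can be downloaded later. The sample CSS and asset URL are invented for the example.

```
from libretexts2zim.css import CssProcessor

processor = CssProcessor()  # defaults: CSS in /content, assets under /content/css_assets

# Invented stylesheet referencing one remote asset, relative to the CSS URL below
css = b'body { background: url("../images/bg.png"); }'
rewritten = processor.process(
    css_original_url="https://a.mtstatic.com/@cache/layout/anonymous.css",
    css_content=css,
)

# The rewritten rule now points at a path under css_assets/, relative to /content
print(rewritten)

# Original asset URL -> full ZIM path, consumed by the run() loop in processor.py
# to stream each asset and add it to the ZIM
for original_url, zim_path in processor.css_assets.items():
    print(original_url, "->", zim_path)
```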
36 changes: 36 additions & 0 deletions scraper/src/libretexts2zim/processor.py
@@ -5,6 +5,7 @@
from pathlib import Path

from pydantic import BaseModel
from requests.exceptions import HTTPError
from zimscraperlib.download import (
stream_file, # pyright: ignore[reportUnknownVariableType]
)
@@ -21,6 +22,7 @@
LibreTextsMetadata,
)
from libretexts2zim.constants import LANGUAGE_ISO_639_3, NAME, ROOT_DIR, VERSION, logger
from libretexts2zim.css import CssProcessor
from libretexts2zim.ui import (
ConfigModel,
PageContentModel,
@@ -261,11 +263,45 @@ def run(self) -> Path:

logger.info(" Fetching and storing home page...")
home = self.libretexts_client.get_home()

welcome_image = BytesIO()
stream_file(home.welcome_image_url, byte_stream=welcome_image)
add_item_for(creator, "content/logo.png", content=welcome_image.getvalue())
del welcome_image

css_processor = CssProcessor()
screen_css = BytesIO()
stream_file(home.screen_css_url, byte_stream=screen_css)
result = css_processor.process(
css_original_url=home.screen_css_url, css_content=screen_css.getvalue()
)
add_item_for(creator, "content/screen.css", content=result)
del screen_css

print_css = BytesIO()
stream_file(home.print_css_url, byte_stream=print_css)
result = css_processor.process(
css_original_url=home.print_css_url, css_content=print_css.getvalue()
)
add_item_for(creator, "content/print.css", content=result)
del print_css

logger.info(f" Retrieving {len(css_processor.css_assets)} CSS assets...")

Check warning on line 289 in scraper/src/libretexts2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/processor.py#L289

Added line #L289 was not covered by tests
for asset_url, asset_path in css_processor.css_assets.items():
try:
css_asset = BytesIO()
stream_file(asset_url, byte_stream=css_asset)
add_item_for(
creator, str(asset_path)[1:], content=css_asset.getvalue()
)
logger.debug(f"Adding {asset_url} to {asset_path} in the ZIM")
del css_asset
except HTTPError as exc:
# a warning would make more sense, but this is just too verbose:
# at least on geo.libretexts.org, many assets are simply missing
logger.debug(f"Ignoring {asset_path} due to {exc}")

Check warning on line 303 in scraper/src/libretexts2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/processor.py#L303

Added line #L303 was not covered by tests

logger.info(f"Adding Vue.JS UI files in {self.zimui_dist}")
for file in self.zimui_dist.rglob("*"):
if file.is_dir():
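
To illustrate the path convention used above (a hypothetical walkthrough with a made-up asset): CSS documents live under `/content`, assets under `/content/css_assets`, and the leading slash is stripped when adding items to the ZIM, so the relative `url()` rewritten by `CssProcessor` resolves correctly from `content/screen.css`.

```
from pathlib import Path

# Hypothetical asset collected by CssProcessor (using the defaults shown in run() above)
asset_path = Path("/content/css_assets/@cache/images/bg.png")

# add_item_for() receives the ZIM path without the leading slash
zim_item_path = str(asset_path)[1:]  # "content/css_assets/@cache/images/bg.png"

# screen.css is stored at content/screen.css, so a url() rewritten relative to
# /content points at the same asset inside the ZIM
relative_url_in_css = asset_path.relative_to("/content")  # css_assets/@cache/images/bg.png
print(zim_item_path, relative_url_in_css)
```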
16 changes: 16 additions & 0 deletions scraper/tests-integration/test_client.py
@@ -134,3 +134,19 @@ def test_get_home_welcome_text_paragraphs(
def test_get_home_page_content(client: LibreTextsClient, page_tree: LibraryTree):
"""Ensures we can get content of root page"""
assert client.get_page_content(page_tree.root).html_body


def test_get_home_screen_css_url(home: LibreTextsHome):
"""Ensures proper screen CSS url is retrieved"""
assert (
home.screen_css_url
== "https://a.mtstatic.com/@cache/layout/anonymous.css?_=715eca8811db7abb8e6f0555936e020d_Z2VvLmxpYnJldGV4dHMub3Jn:site_4038"
)


def test_get_home_print_css_url(home: LibreTextsHome):
"""Ensures proper print CSS url is retrieved"""
assert (
home.print_css_url
== "https://a.mtstatic.com/@cache/layout/print.css?_=99d83fb44eaebe60981933ec554d138d:site_4038"
)