diff --git a/scraper/src/libretexts2zim/client.py b/scraper/src/libretexts2zim/client.py index 7d74402..de555d8 100644 --- a/scraper/src/libretexts2zim/client.py +++ b/scraper/src/libretexts2zim/client.py @@ -20,10 +20,12 @@ class LibreTextsParsingError(Exception): class LibreTextsHome(BaseModel): + home_url: str welcome_text_paragraphs: list[str] welcome_image_url: str screen_css_url: str print_css_url: str + inline_css: list[str] LibraryPageId = str @@ -210,6 +212,8 @@ def get_home(self) -> LibreTextsHome: welcome_image_url=_get_welcome_image_url_from_home(soup), screen_css_url=_get_screen_css_url_from_home(soup), print_css_url=_get_print_css_url_from_home(soup), + inline_css=_get_inline_css_from_home(soup), + home_url=f"{self.library_url}/", ) def get_deki_token(self) -> str: @@ -405,3 +409,9 @@ def _get_screen_css_url_from_home(soup: BeautifulSoup) -> str: def _get_print_css_url_from_home(soup: BeautifulSoup) -> str: """Returns the URL of print CSS found on home page""" return _get_any_css_url_from_home(soup, "print") + + +def _get_inline_css_from_home(soup: BeautifulSoup) -> list[str]: + """Returns inline CSS code found on home page""" + links = soup.find_all("style", {"type": "text/css"}) + return [link.text for link in links if link.text] diff --git a/scraper/src/libretexts2zim/css.py b/scraper/src/libretexts2zim/css.py index 64e075d..f11cc9e 100644 --- a/scraper/src/libretexts2zim/css.py +++ b/scraper/src/libretexts2zim/css.py @@ -51,7 +51,13 @@ def process(self, css_original_url: str, css_content: bytes) -> str: css_original_url, rules, # pyright: ignore[reportUnknownArgumentType] ) - return serialize(rules) + return serialize( + [ + rule + for rule in rules # pyright: ignore[reportUnknownVariableType] + if not isinstance(rule, ast.ParseError) + ] + ) def _process_url( self, css_original_url: str, css_url: str diff --git a/scraper/src/libretexts2zim/processor.py b/scraper/src/libretexts2zim/processor.py index eafcf26..2a77519 100644 --- 
a/scraper/src/libretexts2zim/processor.py +++ b/scraper/src/libretexts2zim/processor.py @@ -286,6 +286,12 @@ def run(self) -> Path: add_item_for(creator, "content/print.css", content=result) del print_css + result = css_processor.process( + css_original_url=home.home_url, + css_content=("\n".join(home.inline_css)).encode(), + ) + add_item_for(creator, "content/inline.css", content=result) + logger.info(f" Retrieving {len(css_processor.css_assets)} CSS assets...") for asset_url, asset_path in css_processor.css_assets.items(): try: diff --git a/scraper/tests-integration/test_client.py b/scraper/tests-integration/test_client.py index ca3eed2..ba90c09 100644 --- a/scraper/tests-integration/test_client.py +++ b/scraper/tests-integration/test_client.py @@ -150,3 +150,13 @@ def test_get_home_print_css_url(home: LibreTextsHome): home.print_css_url == "https://a.mtstatic.com/@cache/layout/print.css?_=99d83fb44eaebe60981933ec554d138d:site_4038" ) + + +def test_get_home_inline_css(home: LibreTextsHome): + """Ensures inline CSS is retrieved from the home page""" + assert len(home.inline_css) >= 10 # 13 expected as of Oct. 2024 + assert len("\n".join(home.inline_css)) >= 35000 # 39843 expected as of Oct. 
2024 + + +def test_get_home_url(home: LibreTextsHome, libretexts_url: str): + assert home.home_url == f"{libretexts_url}/" diff --git a/scraper/tests/test_css.py b/scraper/tests/test_css.py index e6dcc0b..b3d7f5e 100644 --- a/scraper/tests/test_css.py +++ b/scraper/tests/test_css.py @@ -223,6 +223,22 @@ """, id="ignore_data", ), + pytest.param( + """ +div { + background-image: url('https://example.com/image.jpg'); +} +}/*]]>*/ +""", + "https://www.acme.com/styles/main.css", + {"https://example.com/image.jpg": Path("/content/css_assets/image.jpg")}, + """ +div { + background-image: url("css_assets/image.jpg"); +} +""", + id="ignore_parsing_error", + ), ], ) def test_css_processor_single_doc( diff --git a/zimui/index.html b/zimui/index.html index 4fccf60..90659e5 100644 --- a/zimui/index.html +++ b/zimui/index.html @@ -5,6 +5,7 @@ +