Skip to content

Commit

Permalink
Merge pull request #41 from openzim/title_favicon
Browse files Browse the repository at this point in the history
Handle pages title and favicon
  • Loading branch information
benoit74 authored Oct 28, 2024
2 parents e03c789 + deebfa3 commit a732a8a
Show file tree
Hide file tree
Showing 14 changed files with 126 additions and 429 deletions.
2 changes: 2 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ yarn dev

Do not forget to clean up the `public/content` folder before building the Docker image again, otherwise all assets will be pushed to the ZIM.

Note that some assets with a question mark in their URL (e.g. icomoon fonts on LibreTexts Geoscience) do not work properly in the yarn dev server. This is not a problem inside the ZIM. See https://github.com/openzim/mindtouch/issues/34.

```
rm -rf zimui/public/content
```
Expand Down
13 changes: 13 additions & 0 deletions scraper/src/mindtouch2zim/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ class MindtouchHome(BaseModel):
screen_css_url: str
print_css_url: str
inline_css: list[str]
icons_urls: list[str]


LibraryPageId = str
Expand Down Expand Up @@ -181,6 +182,7 @@ def get_home(self) -> MindtouchHome:
print_css_url=_get_print_css_url_from_home(soup),
inline_css=_get_inline_css_from_home(soup),
home_url=f"{self.library_url}/",
icons_urls=_get_icons_urls(soup),
)

def get_deki_token(self) -> str:
Expand Down Expand Up @@ -381,3 +383,14 @@ def _get_inline_css_from_home(soup: BeautifulSoup) -> list[str]:
"""Returns inline CSS code found on home page"""
links = soup.find_all("style", {"type": "text/css"})
return [link.text for link in links if link.text]


def _get_icons_urls(soup: BeautifulSoup) -> list[str]:
    """Returns list of potential icons URLs, best candidates first

    `apple-touch-icon` links are listed before classic `icon` links since they are
    usually bigger than the classic 32x32 favicon, which is ugly once upscaled to
    the 48x48 needed for the ZIM illustration.

    Links without a (non-empty) `href` are ignored, and a given URL is returned
    only once, at the position of its first occurrence.
    """
    links = soup.find_all("link", {"rel": "apple-touch-icon"}) + soup.find_all(
        "link", {"rel": "icon"}
    )
    hrefs = (href for link in links if (href := link.get("href")))
    # dict.fromkeys drops duplicates while preserving order, so that an URL
    # declared by multiple links is not attempted multiple times by callers
    return list(dict.fromkeys(hrefs))
7 changes: 7 additions & 0 deletions scraper/src/mindtouch2zim/entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,12 @@ def main(tmpdir: str) -> None:
dest="stats_filename",
)

parser.add_argument(
"--illustration-url",
help="URL to illustration to use for ZIM illustration and favicon",
dest="illustration_url",
)

args = parser.parse_args()

logger.setLevel(level=logging.DEBUG if args.debug else logging.INFO)
Expand Down Expand Up @@ -246,6 +252,7 @@ def main(tmpdir: str) -> None:
content_filter=doc_filter,
stats_file=Path(args.stats_filename) if args.stats_filename else None,
overwrite_existing_zim=args.overwrite,
illustration_url=args.illustration_url,
).run()
except SystemExit:
logger.error("Generation failed, exiting")
Expand Down
107 changes: 77 additions & 30 deletions scraper/src/mindtouch2zim/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@
from zimscraperlib.download import (
stream_file, # pyright: ignore[reportUnknownVariableType]
)
from zimscraperlib.image import resize_image
from zimscraperlib.image import convert_image, resize_image
from zimscraperlib.image.conversion import convert_svg2png
from zimscraperlib.image.probing import format_for
from zimscraperlib.rewriting.css import CssRewriter
from zimscraperlib.rewriting.html import HtmlRewriter
from zimscraperlib.rewriting.html import rules as html_rules
Expand All @@ -30,8 +32,9 @@
LibraryPageId,
LibraryTree,
MindtouchClient,
MindtouchHome,
)
from mindtouch2zim.constants import LANGUAGE_ISO_639_3, NAME, ROOT_DIR, VERSION, logger
from mindtouch2zim.constants import LANGUAGE_ISO_639_3, NAME, VERSION, logger
from mindtouch2zim.ui import (
ConfigModel,
PageContentModel,
Expand Down Expand Up @@ -59,6 +62,12 @@ class UnsupportedTagError(Exception):
pass


class NoIllustrationFoundError(Exception):
    """Raised when no suitable illustration has been found"""


class ContentFilter(BaseModel):
"""Supports filtering documents by user provided attributes."""

Expand Down Expand Up @@ -174,6 +183,7 @@ def __init__(
output_folder: Path,
zimui_dist: Path,
stats_file: Path | None,
illustration_url: str | None,
*,
overwrite_existing_zim: bool,
) -> None:
Expand All @@ -195,6 +205,7 @@ def __init__(
self.zimui_dist = zimui_dist
self.stats_file = stats_file
self.overwrite_existing_zim = overwrite_existing_zim
self.illustration_url = illustration_url

self.stats_items_done = 0
# we add 1 more item to process so that progress is not 100% at the beginning
Expand All @@ -203,21 +214,6 @@ def __init__(
# could happen in the loop in terms of exit conditions
self.stats_items_total = 1

self.zim_illustration_path = self.libretexts_newsite_path(
"header_logo_mini.png"
)

@staticmethod
def libretexts_newsite_path(name: str) -> Path:
    """Locate `name` inside the third_party/libretexts_newsite folder.

    The folder is looked up below ROOT_DIR. The path is returned only when
    something exists at this location, otherwise a ValueError is raised.
    """
    resource = ROOT_DIR.joinpath("third_party", "libretexts_newsite", name)
    if resource.exists():
        return resource
    raise ValueError(f"File not found at {resource}")

def run(self) -> Path:
"""Generates a zim for a single document.
Expand Down Expand Up @@ -253,15 +249,11 @@ def run(self) -> Path:

creator = Creator(zim_path, "index.html")

logger.debug("Resizing ZIM illustration")
zim_illustration = BytesIO()
resize_image(
src=self.zim_illustration_path,
dst=zim_illustration,
width=48,
height=48,
method="cover",
)
logger.info(" Fetching and storing home page...")
home = self.mindtouch_client.get_home()

logger.info(" Fetching ZIM illustration...")
zim_illustration = self._fetch_zim_illustration(home)

logger.debug("Configuring metadata")
creator.config_metadata(
Expand All @@ -278,11 +270,19 @@ def run(self) -> Path:
Scraper=f"{NAME} v{VERSION}",
Illustration_48x48_at_1=zim_illustration.getvalue(),
)
del zim_illustration

# Start creator early to detect problems early.
with creator as creator:

add_item_for(
creator,
"favicon.ico",
content=self._fetch_favicon_from_illustration(
zim_illustration
).getvalue(),
)
del zim_illustration

logger.info(" Storing configuration...")
add_item_for(
creator,
Expand Down Expand Up @@ -342,9 +342,6 @@ def run(self) -> Path:
is_front=False,
)

logger.info(" Fetching and storing home page...")
home = self.mindtouch_client.get_home()

welcome_image = BytesIO()
stream_file(home.welcome_image_url, byte_stream=welcome_image)
add_item_for(creator, "content/logo.png", content=welcome_image.getvalue())
Expand Down Expand Up @@ -516,6 +513,56 @@ def _report_progress(self):
}
self.stats_file.write_text(json.dumps(progress, indent=2))

def _fetch_zim_illustration(self, home: MindtouchHome) -> BytesIO:
    """Fetch ZIM illustration, convert/resize and return it

    Candidates are the illustration URL configured on the processor when there
    is one, otherwise the icons URLs found on the home page, tried in order. The
    first candidate which is successfully downloaded, converted to PNG (when
    needed) and resized to 48x48 is returned.

    Raises NoIllustrationFoundError when no candidate could be processed, or
    when there is no candidate at all; the error is chained to the last failure
    encountered, if any.
    """
    # local import: keeps this method independent from module-level imports
    from urllib.parse import urljoin

    candidates = (
        [self.illustration_url] if self.illustration_url else home.icons_urls
    )
    last_error: Exception | None = None
    for candidate in candidates:
        # icons might be declared with a relative or protocol-relative href,
        # resolve them against the home URL; absolute URLs are left untouched
        icon_url = urljoin(home.home_url, candidate)
        try:
            logger.debug(f"Downloading {icon_url} illustration")
            illustration_content = BytesIO()
            stream_file(icon_url, byte_stream=illustration_content)
            illustration_format = format_for(
                illustration_content, from_suffix=False
            )
            if illustration_format == "PNG":
                # already in target format, only resizing is needed
                png_illustration = illustration_content
            else:
                png_illustration = BytesIO()
                if illustration_format == "SVG":
                    logger.debug("Converting SVG illustration to PNG")
                    convert_svg2png(illustration_content, png_illustration, 48, 48)
                else:
                    logger.debug(
                        f"Converting {illustration_format} illustration to PNG"
                    )
                    convert_image(illustration_content, png_illustration, fmt="PNG")
            logger.debug("Resizing ZIM illustration")
            # no dst is passed: this relies on resize_image updating `src` in
            # place -- NOTE(review): confirm against zimscraperlib documentation
            resize_image(
                src=png_illustration,
                width=48,
                height=48,
                method="cover",
            )
            return png_illustration
        except Exception as exc:
            # best-effort: a failure on one candidate only moves on to the next
            last_error = exc
            logger.warning(
                f"Failed to retrieve illustration at {icon_url}", exc_info=exc
            )
    raise NoIllustrationFoundError(
        "Failed to find a suitable illustration"
    ) from last_error

def _fetch_favicon_from_illustration(self, illustration: BytesIO) -> BytesIO:
    """Return a converted version of the illustration into favicon

    The illustration is converted to ICO format into a new buffer, which is then
    resized to 32x32 and returned.
    """
    favicon = BytesIO()
    convert_image(illustration, favicon, fmt="ICO")
    # message used to be a copy of the one logged when resizing the illustration
    logger.debug("Resizing favicon")
    # no dst is passed: this relies on resize_image updating `src` in place --
    # NOTE(review): confirm against zimscraperlib documentation
    resize_image(
        src=favicon,
        width=32,
        height=32,
        method="cover",
    )
    return favicon


# Remove all standard attribute rewriting rules: they are not adapted to the Vue.js UI
html_rules.rewrite_attribute_rules.clear()
Expand Down
1 change: 0 additions & 1 deletion scraper/src/mindtouch2zim/third_party/README.md

This file was deleted.

Loading

0 comments on commit a732a8a

Please sign in to comment.