Skip to content

Commit

Permalink
Merge pull request #66 from openzim/glossary
Browse files Browse the repository at this point in the history
Handle libretexts.org glossary rewriting
  • Loading branch information
benoit74 authored Nov 13, 2024
2 parents f0e6cfe + 477304f commit 89e0b16
Show file tree
Hide file tree
Showing 3 changed files with 99 additions and 5 deletions.
57 changes: 57 additions & 0 deletions scraper/src/mindtouch2zim/libretexts/glossary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import html

from bs4 import BeautifulSoup


class GlossaryRewriteError(Exception):
    """Raised when a glossary page cannot be rewritten."""


def _get_formatted_glossary_row(row) -> str:
"""Format one row as HTML"""
word = row.find("td", attrs={"data-th": "Word(s)"}).text
definition = row.find("td", attrs={"data-th": "Definition"}).text
return (
'<p class="glossaryElement">\n'
f' <span class="glossaryTerm">{word}</span>\n'
" |\n"
f' <span class="glossaryDefinition">{definition}</span>\n'
"</p>\n"
)


def rewrite_glossary(original_content: str) -> str | None:
    """Statically rewrite a libretexts.org glossary page.

    The last ``<table>`` of the page is taken as the glossary; each of its body
    rows is rendered as a paragraph inserted right after that table, then every
    table and script is dropped from the document. Only the word and definition
    columns are carried over.

    Args:
        original_content: HTML of the glossary page.

    Returns:
        The prettified rewritten HTML, or ``None`` when the page holds no table
        at all (the glossary is then not using the default template and should
        be processed as a normal page).

    Raises:
        GlossaryRewriteError: if the glossary table has no ``<tbody>``.
    """
    # prefer html.parser to not add <html><body> tags
    parser = "html.parser"
    soup = BeautifulSoup(original_content, parser)

    all_tables = soup.find_all("table")
    if not all_tables:
        return None

    glossary_table = all_tables[-1]
    body = glossary_table.find("tbody")
    if not body:
        raise GlossaryRewriteError("Glossary table body not found")

    rendered_rows = "".join(
        _get_formatted_glossary_row(tr) for tr in body.find_all("tr")
    )
    glossary_table.insert_after(BeautifulSoup(rendered_rows, parser))

    # remove all tables and scripts (both collected before anything is removed)
    to_remove = [*soup.find_all("table"), *soup.find_all("script")]
    for node in to_remove:
        node.decompose()
    return soup.prettify()
22 changes: 17 additions & 5 deletions scraper/src/mindtouch2zim/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
from mindtouch2zim.errors import NoIllustrationFoundError
from mindtouch2zim.html import get_text
from mindtouch2zim.html_rewriting import HtmlUrlsRewriter
from mindtouch2zim.libretexts.glossary import rewrite_glossary
from mindtouch2zim.ui import (
ConfigModel,
PageContentModel,
Expand Down Expand Up @@ -474,7 +475,20 @@ def _process_page(
post_head_insert=None,
notify_js_module=None,
)
rewriten = rewriter.rewrite(page_content.html_body)
if self.mindtouch_client.library_url.endswith(".libretexts.org") and re.match(
r"^.*\/zz:_[^\/]*?\/20:_[^\/]*$", page.path
):
# glossary pages on libretexts.org, e.g. "Courses/California_State_Universi
# ty_Los_Angeles/Book:_An_Introduction_to_Geology_(Johnson_Affolter_Inkenbr
# andt_and_Mosher)/zz:_Back_Matter/20:_Glossary", running at https://geo.li
# bretexts.org/Courses/California_State_University_Los_Angeles/Book%3A_An_I
# ntroduction_to_Geology_(Johnson_Affolter_Inkenbrandt_and_Mosher)/zz%3A_Ba
# ck_Matter/20%3A_Glossary
rewriten = rewrite_glossary(page_content.html_body)
if not rewriten:
rewriten = rewriter.rewrite(page_content.html_body).content
else:
rewriten = rewriter.rewrite(page_content.html_body).content
for path, urls in url_rewriter.items_to_download.items():
if path in self.items_to_download:
self.items_to_download[path].urls.update(urls)
Expand All @@ -484,14 +498,12 @@ def _process_page(
)
creator.add_item_for(
f"content/page_content_{page.id}.json",
content=PageContentModel(html_body=rewriten.content).model_dump_json(
by_alias=True
),
content=PageContentModel(html_body=rewriten).model_dump_json(by_alias=True),
)
self._add_indexing_item_to_zim(
creator=creator,
title=page.title,
content=get_text(rewriten.content),
content=get_text(rewriten),
fname=f"page_{page.id}",
zimui_redirect=page.path,
)
Expand Down
25 changes: 25 additions & 0 deletions zimui/public/custom.css
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,28 @@ a[href^="https://"]:after
height: auto;
display: block;
}

/* additional CSS for libretexts.org glossary pages, reproduced here for simplicity
and maintainability (less sensitive to file moves upstream)
*/

/* Term of a glossary entry: matches the "glossaryTerm" span emitted by the
glossary rewriter; shown in bold to stand out from its definition */
.glossaryTerm {
font-weight: bold;
}

/* One glossary entry: the paragraph wrapping a term and its definition */
p.glossaryElement {
font-size: 1em;
margin: 0.5rem 0 1em;
}

/* Print-only overrides: smaller text and tighter spacing for glossary entries */
@media print {
.glossaryDefinition {
font-size: 0.85em;
}
p.glossaryElement {
/* avoid splitting a single entry across two pages */
break-inside: avoid;
margin: 0.4rem 0;
font-size: 0.9em;
line-height: 0.9em;
}
}

0 comments on commit 89e0b16

Please sign in to comment.