Skip to content

Commit

Permalink
Handle libretexts.org glossary rewriting
Browse files Browse the repository at this point in the history
  • Loading branch information
benoit74 committed Nov 11, 2024
1 parent 6ddba30 commit 4a9c441
Show file tree
Hide file tree
Showing 3 changed files with 97 additions and 5 deletions.
62 changes: 62 additions & 0 deletions scraper/src/mindtouch2zim/libretexts/glossary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
from bs4 import BeautifulSoup


class GlossaryRewriteError(Exception):
    """Exception indicating a problem during glossary rewrite

    Raised by the glossary rewriting code when the page does not have the
    expected structure (glossary table missing, more than one candidate
    table, or table without a body).
    """


def _get_formatted_glossary_row(row) -> str:
    """Format one glossary table row as an HTML paragraph.

    Args:
        row: a table row exposing ``find(name, attrs=...)``; the caller passes
            the ``<tr>`` elements of the glossary table body. The row is
            expected to hold one ``<td data-th="Word(s)">`` cell and one
            ``<td data-th="Definition">`` cell.

    Returns:
        A ``<p class="glossaryElement">`` snippet holding the term and its
        definition, each wrapped in its own ``<span>``.

    Raises:
        GlossaryRewriteError: if the word or the definition cell is missing.
    """
    # Function-scope import so this block does not depend on the module header.
    from html import escape

    word_cell = row.find("td", attrs={"data-th": "Word(s)"})
    definition_cell = row.find("td", attrs={"data-th": "Definition"})
    if word_cell is None or definition_cell is None:
        raise GlossaryRewriteError(
            "Glossary row is missing its word or definition cell"
        )

    # `.text` yields decoded text; the snippet built below is parsed as HTML
    # again by the caller, so `&`, `<` and `>` must be re-escaped or they would
    # be interpreted as markup / character references.
    word = escape(word_cell.text, quote=False)
    definition = escape(definition_cell.text, quote=False)
    return (
        '<p class="glossaryElement">\n'
        f' <span class="glossaryTerm">{word}</span>\n'
        " |\n"
        f' <span class="glossaryDefinition">{definition}</span>\n'
        "</p>\n"
    )


def rewrite_glossary(original_content: str) -> str:
    """Statically rewrite the glossary of libretexts.org

    Only word and description columns are supported.

    The glossary table is the single captioned table whose caption is not
    "Example and Directions". Each row of its body is turned into a
    ``<p class="glossaryElement">`` paragraph inserted right after the table,
    then every table and script is removed from the document.

    Args:
        original_content: HTML of the glossary page body.

    Returns:
        The rewritten HTML, as produced by ``BeautifulSoup.prettify()``.

    Raises:
        GlossaryRewriteError: if no glossary table is found, if more than one
            candidate table is found, or if the glossary table has no
            ``<tbody>``.
    """
    soup = BeautifulSoup(
        original_content,
        "html.parser",  # prefer html.parser to not add <html><body> tags
    )

    glossary_table = None
    for table in soup.find_all("table"):
        caption = table.caption
        if caption is None:
            # tables without a caption are never the glossary
            continue
        if caption.text.strip() == "Example and Directions":
            # captioned helper table which is not the glossary itself
            continue
        if glossary_table is not None:
            raise GlossaryRewriteError("Too many glossary tables")
        glossary_table = table

    if glossary_table is None:
        raise GlossaryRewriteError("Glossary table not found")

    tbody = glossary_table.find("tbody")
    if tbody is None:
        raise GlossaryRewriteError("Glossary table body not found")

    # Build the static replacement and place it right after the table, so that
    # it stays in the document once the table itself is removed below.
    glossary_table.insert_after(
        BeautifulSoup(
            "".join(_get_formatted_glossary_row(row) for row in tbody.find_all("tr")),
            "html.parser",  # prefer html.parser to not add <html><body> tags
        )
    )

    # remove all tables and scripts
    for item in soup.find_all("table") + soup.find_all("script"):
        item.decompose()
    return soup.prettify()

Check warning on line 62 in scraper/src/mindtouch2zim/libretexts/glossary.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/libretexts/glossary.py#L61-L62

Added lines #L61 - L62 were not covered by tests
15 changes: 10 additions & 5 deletions scraper/src/mindtouch2zim/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
from mindtouch2zim.errors import NoIllustrationFoundError
from mindtouch2zim.html import get_text
from mindtouch2zim.html_rewriting import HtmlUrlsRewriter
from mindtouch2zim.libretexts.glossary import rewrite_glossary
from mindtouch2zim.ui import (
ConfigModel,
PageContentModel,
Expand Down Expand Up @@ -474,7 +475,13 @@ def _process_page(
post_head_insert=None,
notify_js_module=None,
)
rewriten = rewriter.rewrite(page_content.html_body)
if (
self.mindtouch_client.library_url.endswith(".libretexts.org")
and page.title == "Glossary"
):
rewriten = rewrite_glossary(page_content.html_body)

Check warning on line 482 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L482

Added line #L482 was not covered by tests
else:
rewriten = rewriter.rewrite(page_content.html_body).content

Check warning on line 484 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L484

Added line #L484 was not covered by tests
for path, urls in url_rewriter.items_to_download.items():
if path in self.items_to_download:
self.items_to_download[path].urls.update(urls)
Expand All @@ -484,14 +491,12 @@ def _process_page(
)
creator.add_item_for(
f"content/page_content_{page.id}.json",
content=PageContentModel(html_body=rewriten.content).model_dump_json(
by_alias=True
),
content=PageContentModel(html_body=rewriten).model_dump_json(by_alias=True),
)
self._add_indexing_item_to_zim(
creator=creator,
title=page.title,
content=get_text(rewriten.content),
content=get_text(rewriten),
fname=f"page_{page.id}",
zimui_redirect=page.path,
)
Expand Down
25 changes: 25 additions & 0 deletions zimui/public/custom.css
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,28 @@ a[href^="https://"]:after
height: auto;
display: block;
}

/* additional CSS for libretexts.org glossary pages, reproduced here for simplicity
and maintainability (less sensitive to file moves upstream)
*/

.glossaryTerm {
font-weight: bold;
}

p.glossaryElement {
font-size: 1em;
margin: 0.5rem 0 1em;
}

@media print {
.glossaryDefinition {
font-size: 0.85em;
}
p.glossaryElement {
break-inside: avoid;
margin: 0.4rem 0;
font-size: 0.9em;
line-height: 0.9em;
}
}

0 comments on commit 4a9c441

Please sign in to comment.