Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle libretexts.org glossary rewriting #66

Merged
merged 3 commits into from
Nov 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions scraper/src/mindtouch2zim/libretexts/glossary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from bs4 import BeautifulSoup


class GlossaryRewriteError(Exception):
    """Exception indicating a problem during glossary rewrite.

    Raised when a page identified as a glossary does not have the structure
    the rewriter needs (e.g. the glossary table has no body).
    """


def _get_formatted_glossary_row(row) -> str:
"""Format one row as HTML"""
word = row.find("td", attrs={"data-th": "Word(s)"}).text
definition = row.find("td", attrs={"data-th": "Definition"}).text
return (

Check warning on line 14 in scraper/src/mindtouch2zim/libretexts/glossary.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/libretexts/glossary.py#L12-L14

Added lines #L12 - L14 were not covered by tests
'<p class="glossaryElement">\n'
f' <span class="glossaryTerm">{word}</span>\n'
" |\n"
f' <span class="glossaryDefinition">{definition}</span>\n'
"</p>\n"
)


def rewrite_glossary(original_content: str) -> str | None:
    """Statically rewrite the glossary of libretexts.org

    The last ``<table>`` found in the page is taken as the glossary table. Each
    ``<tr>`` of its ``<tbody>`` is turned into a ``<p class="glossaryElement">``
    paragraph inserted right after that table; then every ``<table>`` and
    ``<script>`` element is removed from the document.

    Only word and description columns are supported.

    Args:
        original_content: HTML of the glossary page.

    Returns:
        The rewritten, prettified HTML, or ``None`` when the page contains no
        table at all (the caller is then expected to process it as a normal page).

    Raises:
        GlossaryRewriteError: if the glossary table has no ``<tbody>``.
    """
    soup = BeautifulSoup(
        original_content,
        "html.parser",  # prefer html.parser to not add <html><body> tags
    )

    tables = soup.find_all("table")
    if not tables:
        # looks like this glossary is not using default template ; let's rewrite as
        # a normal page
        return None
    glossary_table = tables[-1]

    tbody = glossary_table.find("tbody")
    if tbody is None:
        raise GlossaryRewriteError("Glossary table body not found")

    # build the static replacement and place it right after the table, which is
    # still in the tree at this point and is only removed below
    glossary_table.insert_after(
        BeautifulSoup(
            "".join(_get_formatted_glossary_row(row) for row in tbody.find_all("tr")),
            "html.parser",  # prefer html.parser to not add <html><body> tags
        )
    )

    # remove all tables and scripts
    for item in soup.find_all("table") + soup.find_all("script"):
        item.decompose()
    return soup.prettify()

Check warning on line 57 in scraper/src/mindtouch2zim/libretexts/glossary.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/libretexts/glossary.py#L56-L57

Added lines #L56 - L57 were not covered by tests
22 changes: 17 additions & 5 deletions scraper/src/mindtouch2zim/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
from mindtouch2zim.errors import NoIllustrationFoundError
from mindtouch2zim.html import get_text
from mindtouch2zim.html_rewriting import HtmlUrlsRewriter
from mindtouch2zim.libretexts.glossary import rewrite_glossary
from mindtouch2zim.ui import (
ConfigModel,
PageContentModel,
Expand Down Expand Up @@ -474,7 +475,20 @@
post_head_insert=None,
notify_js_module=None,
)
rewriten = rewriter.rewrite(page_content.html_body)
if self.mindtouch_client.library_url.endswith(".libretexts.org") and re.match(
benoit74 marked this conversation as resolved.
Show resolved Hide resolved
r"^.*\/zz:_[^\/]*?\/20:_[^\/]*$", page.path
):
# glossary pages on libretexts.org, e.g. "Courses/California_State_Universi
# ty_Los_Angeles/Book:_An_Introduction_to_Geology_(Johnson_Affolter_Inkenbr
# andt_and_Mosher)/zz:_Back_Matter/20:_Glossary", running at https://geo.li
# bretexts.org/Courses/California_State_University_Los_Angeles/Book%3A_An_I
# ntroduction_to_Geology_(Johnson_Affolter_Inkenbrandt_and_Mosher)/zz%3A_Ba
# ck_Matter/20%3A_Glossary
rewriten = rewrite_glossary(page_content.html_body)

Check warning on line 487 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L487

Added line #L487 was not covered by tests
if not rewriten:
rewriten = rewriter.rewrite(page_content.html_body).content

Check warning on line 489 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L489

Added line #L489 was not covered by tests
else:
rewriten = rewriter.rewrite(page_content.html_body).content

Check warning on line 491 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L491

Added line #L491 was not covered by tests
for path, urls in url_rewriter.items_to_download.items():
if path in self.items_to_download:
self.items_to_download[path].urls.update(urls)
Expand All @@ -484,14 +498,12 @@
)
creator.add_item_for(
f"content/page_content_{page.id}.json",
content=PageContentModel(html_body=rewriten.content).model_dump_json(
by_alias=True
),
content=PageContentModel(html_body=rewriten).model_dump_json(by_alias=True),
)
self._add_indexing_item_to_zim(
creator=creator,
title=page.title,
content=get_text(rewriten.content),
content=get_text(rewriten),
fname=f"page_{page.id}",
zimui_redirect=page.path,
)
Expand Down
25 changes: 25 additions & 0 deletions zimui/public/custom.css
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,28 @@ a[href^="https://"]:after
height: auto;
display: block;
}

/* additional CSS for libretexts.org glossary pages, reproduced here for simplicity
and maintainability (less sensitive to file move upstream)
*/

/* the glossary term is emphasized in bold */
.glossaryTerm {
font-weight: bold;
}

/* one paragraph per glossary entry (term and definition) */
p.glossaryElement {
font-size: 1em;
margin: 0.5rem 0 1em;
}

/* more compact rendering when the page is printed */
@media print {
.glossaryDefinition {
font-size: 0.85em;
}
p.glossaryElement {
/* ask the browser not to split a single entry across a page/column break */
break-inside: avoid;
margin: 0.4rem 0;
font-size: 0.9em;
line-height: 0.9em;
}
}