Skip to content

Commit

Permalink
Support more cases, including international libraries
Browse files (browse the repository at this point in the history)
  • Loading branch information
benoit74 committed Nov 13, 2024
1 parent 7cd4d29 commit c516446
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 15 deletions.
19 changes: 7 additions & 12 deletions scraper/src/mindtouch2zim/libretexts/glossary.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def _get_formatted_glossary_row(row) -> str:
)


def rewrite_glossary(original_content: str) -> str:
def rewrite_glossary(original_content: str) -> str | None:
"""Statically rewrite the glossary of libretexts.org
Only word and description columns are supported.
Expand All @@ -33,17 +33,12 @@ def rewrite_glossary(original_content: str) -> str:

glossary_table = None

Check warning on line 34 in scraper/src/mindtouch2zim/libretexts/glossary.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/libretexts/glossary.py#L34

Added line #L34 was not covered by tests

for table in soup.find_all("table"):
if not table.caption:
continue
if table.caption and table.caption.text.strip() == "Example and Directions":
continue
if glossary_table:
raise GlossaryRewriteError("Too many glossary tables")
glossary_table = table

if not glossary_table:
raise GlossaryRewriteError("Glossary table not found")
tables = soup.find_all("table")

Check warning on line 36 in scraper/src/mindtouch2zim/libretexts/glossary.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/libretexts/glossary.py#L36

Added line #L36 was not covered by tests
if len(tables) == 0:
# looks like this glossary is not using default template ; let's rewrite as
# a normal page
return None
glossary_table = tables[-1]

Check warning on line 41 in scraper/src/mindtouch2zim/libretexts/glossary.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/libretexts/glossary.py#L40-L41

Added lines #L40 - L41 were not covered by tests

tbody = glossary_table.find("tbody")

Check warning on line 43 in scraper/src/mindtouch2zim/libretexts/glossary.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/libretexts/glossary.py#L43

Added line #L43 was not covered by tests
if not tbody:
Expand Down
7 changes: 4 additions & 3 deletions scraper/src/mindtouch2zim/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -475,11 +475,12 @@ def _process_page(
post_head_insert=None,
notify_js_module=None,
)
if (
self.mindtouch_client.library_url.endswith(".libretexts.org")
and page.title == "Glossary"
if self.mindtouch_client.library_url.endswith(".libretexts.org") and re.match(
r"^.*\/zz:_[^\/]*?\/20:_[^\/]*$", page.path
):
rewriten = rewrite_glossary(page_content.html_body)

Check warning on line 481 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L481

Added line #L481 was not covered by tests
if not rewriten:
rewriten = rewriter.rewrite(page_content.html_body).content

Check warning on line 483 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L483

Added line #L483 was not covered by tests
else:
rewriten = rewriter.rewrite(page_content.html_body).content

Check warning on line 485 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L485

Added line #L485 was not covered by tests
for path, urls in url_rewriter.items_to_download.items():
Expand Down

0 comments on commit c516446

Please sign in to comment.