diff --git a/scraper/src/mindtouch2zim/libretexts/glossary.py b/scraper/src/mindtouch2zim/libretexts/glossary.py index 3701895..e8dc007 100644 --- a/scraper/src/mindtouch2zim/libretexts/glossary.py +++ b/scraper/src/mindtouch2zim/libretexts/glossary.py @@ -20,7 +20,7 @@ def _get_formatted_glossary_row(row) -> str: ) -def rewrite_glossary(original_content: str) -> str: +def rewrite_glossary(original_content: str) -> str | None: """Statically rewrite the glossary of libretexts.org Only word and description columns are supported. @@ -33,17 +33,12 @@ def rewrite_glossary(original_content: str) -> str: glossary_table = None - for table in soup.find_all("table"): - if not table.caption: - continue - if table.caption and table.caption.text.strip() == "Example and Directions": - continue - if glossary_table: - raise GlossaryRewriteError("Too many glossary tables") - glossary_table = table - - if not glossary_table: - raise GlossaryRewriteError("Glossary table not found") + tables = soup.find_all("table") + if len(tables) == 0: + # looks like this glossary is not using default template ; let's rewrite as + # a normal page + return None + glossary_table = tables[-1] tbody = glossary_table.find("tbody") if not tbody: diff --git a/scraper/src/mindtouch2zim/processor.py b/scraper/src/mindtouch2zim/processor.py index b0845a8..6a04263 100644 --- a/scraper/src/mindtouch2zim/processor.py +++ b/scraper/src/mindtouch2zim/processor.py @@ -475,11 +475,12 @@ def _process_page( post_head_insert=None, notify_js_module=None, ) - if ( - self.mindtouch_client.library_url.endswith(".libretexts.org") - and page.title == "Glossary" + if self.mindtouch_client.library_url.endswith(".libretexts.org") and re.match( + r"^.*\/zz:_[^\/]*?\/20:_[^\/]*$", page.path ): rewriten = rewrite_glossary(page_content.html_body) + if not rewriten: + rewriten = rewriter.rewrite(page_content.html_body).content else: rewriten = rewriter.rewrite(page_content.html_body).content for path, urls in url_rewriter.items_to_download.items():