openzim · benoit74 · Nov 13, 2024 · Nov 11, 2024 · Nov 13, 2024 · Nov 13, 2024
diff --git a/scraper/src/mindtouch2zim/libretexts/glossary.py b/scraper/src/mindtouch2zim/libretexts/glossary.py
@@ -9,9 +9,9 @@

 def _get_formatted_glossary_row(row) -> str:
    """Format one row as HTML"""
    word = row.find("td", attrs={"data-th": "Word(s)"}).text
    definition = row.find("td", attrs={"data-th": "Definition"}).text
    return (
        '<p class="glossaryElement">\n'
        f'  <span class="glossaryTerm">{word}</span>\n'
        "  |\n"
@@ -20,36 +20,31 @@
     )
 
 
-def rewrite_glossary(original_content: str) -> str:
+def rewrite_glossary(original_content: str) -> str | None:
     """Statically rewrite the glossary of libretexts.org
 
     Only word and description columns are supported.
    """

    soup = BeautifulSoup(
        original_content,
        "html.parser",  # prefer html.parser to not add <html><body> tags
    )
 
     glossary_table = None
 
-    for table in soup.find_all("table"):
-        if not table.caption:
-            continue
-        if table.caption and table.caption.text.strip() == "Example and Directions":
-            continue
-        if glossary_table:
-            raise GlossaryRewriteError("Too many glossary tables")
-        glossary_table = table
-
-    if not glossary_table:
-        raise GlossaryRewriteError("Glossary table not found")
+    tables = soup.find_all("table")
+    if len(tables) == 0:
+        # looks like this glossary is not using the default template; return None so
+        # that the caller rewrites it as a normal page
+        return None
+    glossary_table = tables[-1]
 
     tbody = glossary_table.find("tbody")
     if not tbody:
        raise GlossaryRewriteError("Glossary table body not found")

    glossary_table.insert_after(
        BeautifulSoup(
            "".join([_get_formatted_glossary_row(row) for row in tbody.find_all("tr")]),
            "html.parser",  # prefer html.parser to not add <html><body> tags
@@ -58,5 +53,5 @@

    # remove all tables and scripts
    for item in soup.find_all("table") + soup.find_all("script"):
        item.decompose()
    return soup.prettify()
diff --git a/scraper/src/mindtouch2zim/processor.py b/scraper/src/mindtouch2zim/processor.py
@@ -475,13 +475,14 @@
             post_head_insert=None,
             notify_js_module=None,
         )
-        if (
-            self.mindtouch_client.library_url.endswith(".libretexts.org")
-            and page.title == "Glossary"
+        if self.mindtouch_client.library_url.endswith(".libretexts.org") and re.match(
+            r"^.*\/zz:_[^\/]*?\/20:_[^\/]*$", page.path
         ):
             rewriten = rewrite_glossary(page_content.html_body)
+            if not rewriten:
+                rewriten = rewriter.rewrite(page_content.html_body).content
         else:
             rewriten = rewriter.rewrite(page_content.html_body).content
         for path, urls in url_rewriter.items_to_download.items():
            if path in self.items_to_download:
                self.items_to_download[path].urls.update(urls)