From 477304fcfc3bf84bbcb8bedc24d56ceb4cd4233c Mon Sep 17 00:00:00 2001
From: benoit74
Date: Wed, 13 Nov 2024 10:07:51 +0000
Subject: [PATCH] Add sample path and URL for glossary pages

---
 scraper/src/mindtouch2zim/processor.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/scraper/src/mindtouch2zim/processor.py b/scraper/src/mindtouch2zim/processor.py
index 6a04263..003c390 100644
--- a/scraper/src/mindtouch2zim/processor.py
+++ b/scraper/src/mindtouch2zim/processor.py
@@ -478,6 +478,12 @@ def _process_page(
         if self.mindtouch_client.library_url.endswith(".libretexts.org") and re.match(
             r"^.*\/zz:_[^\/]*?\/20:_[^\/]*$", page.path
         ):
+            # glossary pages on libretexts.org, e.g. "Courses/California_State_Universi
+            # ty_Los_Angeles/Book:_An_Introduction_to_Geology_(Johnson_Affolter_Inkenbr
+            # andt_and_Mosher)/zz:_Back_Matter/20:_Glossary", running at https://geo.li
+            # bretexts.org/Courses/California_State_University_Los_Angeles/Book%3A_An_I
+            # ntroduction_to_Geology_(Johnson_Affolter_Inkenbrandt_and_Mosher)/zz%3A_Ba
+            # ck_Matter/20%3A_Glossary
             rewriten = rewrite_glossary(page_content.html_body)
             if not rewriten:
                 rewriten = rewriter.rewrite(page_content.html_body).content