diff --git a/scraper/src/mindtouch2zim/client.py b/scraper/src/mindtouch2zim/client.py index 6734d13..79fd8cf 100644 --- a/scraper/src/mindtouch2zim/client.py +++ b/scraper/src/mindtouch2zim/client.py @@ -13,6 +13,7 @@ logger, web_session, ) +from mindtouch2zim.html import get_soup class MindtouchParsingError(Exception): @@ -173,7 +174,7 @@ def get_home(self) -> MindtouchHome: """Retrieves data about home page by crawling home page""" home_content = self._get_text("/") - soup = _get_soup(home_content) + soup = get_soup(home_content) self.deki_token = _get_deki_token_from_home(soup) return MindtouchHome( welcome_text_paragraphs=_get_welcome_text_from_home(soup), @@ -192,7 +193,7 @@ def get_deki_token(self) -> str: home_content = self._get_text("/") - soup = _get_soup(home_content) + soup = get_soup(home_content) self.deki_token = _get_deki_token_from_home(soup) return self.deki_token @@ -290,14 +291,6 @@ def get_page_content(self, page: LibraryPage) -> LibraryPageContent: return LibraryPageContent(html_body=tree["body"][0]) -def _get_soup(content: str) -> BeautifulSoup: - """Return a BeautifulSoup soup from textual content - - This is a utility function to ensure same parser is used in the whole codebase - """ - return BeautifulSoup(content, "lxml") - - def _get_welcome_image_url_from_home(soup: BeautifulSoup) -> str: """Return the URL of the image found on home header""" branding_div = soup.find("div", class_="LTBranding") diff --git a/scraper/src/mindtouch2zim/html.py b/scraper/src/mindtouch2zim/html.py new file mode 100644 index 0000000..887d817 --- /dev/null +++ b/scraper/src/mindtouch2zim/html.py @@ -0,0 +1,17 @@ +from bs4 import BeautifulSoup + + +def get_soup(content: str) -> BeautifulSoup: + """Return a BeautifulSoup soup from HTML content + + This is a utility function to ensure same parser is used in the whole codebase + """ + return BeautifulSoup(content, "lxml") + + +def get_text(content: str) -> str: + """Return text data from HTML content + + This is typically meant to extract content to index in the ZIM + """ + return get_soup(content).getText("\n", strip=True) diff --git a/scraper/src/mindtouch2zim/processor.py b/scraper/src/mindtouch2zim/processor.py index 4d418f3..6a23802 100644 --- a/scraper/src/mindtouch2zim/processor.py +++ b/scraper/src/mindtouch2zim/processor.py @@ -42,6 +42,7 @@ logger, web_session, ) +from mindtouch2zim.html import get_text from mindtouch2zim.ui import ( ConfigModel, PageContentModel, @@ -539,6 +540,13 @@ def _process_page( by_alias=True ), ) + self._add_indexing_item_to_zim( + creator=creator, + title=page.title, + content=get_text(rewriten.content), + fname=f"page_{page.id}", + zimui_redirect=page.path, + ) def _report_progress(self): """report progress to stats file""" @@ -604,6 +612,39 @@ def _fetch_favicon_from_illustration(self, illustration: BytesIO) -> BytesIO: ) return favicon + def _add_indexing_item_to_zim( + self, + creator: Creator, + title: str, + content: str, + fname: str, + zimui_redirect: str, + ): + """Add a 'fake' item to the ZIM, with proper indexing data + + This is mandatory for suggestions and fulltext search to work properly, since + we do not really have pages to search for. + + This item is a very basic HTML which automatically redirect to proper Vue.JS URL + """ + + redirect_url = f"../index.html#/{zimui_redirect}" + html_content = ( + f"