Merge pull request #50 from openzim/populate_search_indexes

Index pages for suggestions and full-text search
openzim · Oct 29, 2024 · 607b60f
2 parents bc02d11 + e69b833
commit 607b60f
Show file tree

Hide file tree

Showing 4 changed files with 63 additions and 12 deletions.
diff --git a/scraper/src/mindtouch2zim/client.py b/scraper/src/mindtouch2zim/client.py
@@ -13,6 +13,7 @@
     logger,
     web_session,
 )
+from mindtouch2zim.html import get_soup
 
 
 class MindtouchParsingError(Exception):
@@ -173,7 +174,7 @@ def get_home(self) -> MindtouchHome:
         """Retrieves data about home page by crawling home page"""
         home_content = self._get_text("/")
 
-        soup = _get_soup(home_content)
+        soup = get_soup(home_content)
         self.deki_token = _get_deki_token_from_home(soup)
         return MindtouchHome(
             welcome_text_paragraphs=_get_welcome_text_from_home(soup),
@@ -192,7 +193,7 @@ def get_deki_token(self) -> str:
 
         home_content = self._get_text("/")
 
-        soup = _get_soup(home_content)
+        soup = get_soup(home_content)
         self.deki_token = _get_deki_token_from_home(soup)
         return self.deki_token
 
@@ -290,14 +291,6 @@ def get_page_content(self, page: LibraryPage) -> LibraryPageContent:
         return LibraryPageContent(html_body=tree["body"][0])
 
 
-def _get_soup(content: str) -> BeautifulSoup:
-    """Return a BeautifulSoup soup from textual content
-
-    This is a utility function to ensure same parser is used in the whole codebase
-    """
-    return BeautifulSoup(content, "lxml")
-
-
 def _get_welcome_image_url_from_home(soup: BeautifulSoup) -> str:
     """Return the URL of the image found on home header"""
     branding_div = soup.find("div", class_="LTBranding")

diff --git a/scraper/src/mindtouch2zim/html.py b/scraper/src/mindtouch2zim/html.py
@@ -0,0 +1,17 @@
+from bs4 import BeautifulSoup
+
+
+def get_soup(content: str) -> BeautifulSoup:
+    """Return a BeautifulSoup soup from HTML content
+
+    This is a utility function to ensure the same parser is used in the whole codebase
+    """
+    return BeautifulSoup(content, "lxml")
+
+
+def get_text(content: str) -> str:
+    """Return text data from HTML content
+
+    This is typically meant to extract content to index in the ZIM
+    """
+    return get_soup(content).getText("\n", strip=True)
diff --git a/scraper/src/mindtouch2zim/processor.py b/scraper/src/mindtouch2zim/processor.py
@@ -42,6 +42,7 @@
     logger,
     web_session,
 )
+from mindtouch2zim.html import get_text
 from mindtouch2zim.ui import (
     ConfigModel,
     PageContentModel,
@@ -539,6 +540,13 @@ def _process_page(
                 by_alias=True
             ),
         )
+        self._add_indexing_item_to_zim(
+            creator=creator,
+            title=page.title,
+            content=get_text(rewriten.content),
+            fname=f"page_{page.id}",
+            zimui_redirect=page.path,
+        )
 
     def _report_progress(self):
         """report progress to stats file"""
@@ -604,6 +612,39 @@ def _fetch_favicon_from_illustration(self, illustration: BytesIO) -> BytesIO:
         )
         return favicon
 
+    def _add_indexing_item_to_zim(
+        self,
+        creator: Creator,
+        title: str,
+        content: str,
+        fname: str,
+        zimui_redirect: str,
+    ):
+        """Add a 'fake' item to the ZIM, with proper indexing data
+
+        This is mandatory for suggestions and fulltext search to work properly, since
+        we do not really have pages to search for.
+
+        This item is a minimal HTML page which automatically redirects to the Vue.JS URL
+        """
+
+        redirect_url = f"../index.html#/{zimui_redirect}"
+        html_content = (
+            f"<html><head><title>{title}</title>"
+            f'<meta http-equiv="refresh" content="0;URL=\'{redirect_url}\'" />'
+            f"</head><body></body></html>"
+        )
+
+        logger.debug(f"Adding {fname} to ZIM index")
+        add_item_for(
+            creator=creator,
+            title=title,
+            path="index/" + fname,
+            content=bytes(html_content, "utf-8"),
+            mimetype="text/html",
+            index_data=IndexData(title=title, content=content),
+        )
+
 
 # remove all standard rules, they are not adapted to Vue.JS UI
 html_rules.rewrite_attribute_rules.clear()

diff --git a/scraper/tests/test_client.py b/scraper/tests/test_client.py
@@ -1,9 +1,9 @@
 import pytest
 
 from mindtouch2zim.client import (
-    _get_soup,  # pyright: ignore[reportPrivateUsage]
     _get_welcome_text_from_home,  # pyright: ignore[reportPrivateUsage]
 )
+from mindtouch2zim.html import get_soup
 
 
 @pytest.mark.parametrize(
@@ -54,4 +54,4 @@
     ],
 )
 def test_get_welcome_text_from_home(content: str, expected: str):
-    assert _get_welcome_text_from_home(_get_soup(content)) == expected
+    assert _get_welcome_text_from_home(get_soup(content)) == expected