Skip to content

Commit

Permalink
Merge pull request #50 from openzim/populate_search_indexes
Browse files Browse the repository at this point in the history
Index pages for suggestions and full-text search
  • Loading branch information
benoit74 authored Oct 29, 2024
2 parents bc02d11 + e69b833 commit 607b60f
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 12 deletions.
13 changes: 3 additions & 10 deletions scraper/src/mindtouch2zim/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
logger,
web_session,
)
from mindtouch2zim.html import get_soup


class MindtouchParsingError(Exception):
Expand Down Expand Up @@ -173,7 +174,7 @@ def get_home(self) -> MindtouchHome:
"""Retrieves data about home page by crawling home page"""
home_content = self._get_text("/")

soup = _get_soup(home_content)
soup = get_soup(home_content)
self.deki_token = _get_deki_token_from_home(soup)
return MindtouchHome(
welcome_text_paragraphs=_get_welcome_text_from_home(soup),
Expand All @@ -192,7 +193,7 @@ def get_deki_token(self) -> str:

home_content = self._get_text("/")

soup = _get_soup(home_content)
soup = get_soup(home_content)
self.deki_token = _get_deki_token_from_home(soup)
return self.deki_token

Expand Down Expand Up @@ -290,14 +291,6 @@ def get_page_content(self, page: LibraryPage) -> LibraryPageContent:
return LibraryPageContent(html_body=tree["body"][0])


def _get_soup(content: str) -> BeautifulSoup:
    """Parse textual content into a BeautifulSoup tree.

    Centralizing soup creation in this helper ensures the same parser is used
    in the whole codebase.
    """
    parser = "lxml"
    return BeautifulSoup(content, parser)


def _get_welcome_image_url_from_home(soup: BeautifulSoup) -> str:
"""Return the URL of the image found on home header"""
branding_div = soup.find("div", class_="LTBranding")
Expand Down
17 changes: 17 additions & 0 deletions scraper/src/mindtouch2zim/html.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from bs4 import BeautifulSoup


def get_soup(content: str) -> BeautifulSoup:
    """Parse HTML content into a BeautifulSoup tree.

    Centralizing soup creation in this helper ensures the same parser is used
    in the whole codebase.
    """
    parser = "lxml"
    return BeautifulSoup(content, parser)


def get_text(content: str) -> str:
    """Extract the text data of an HTML document.

    The HTML is parsed with `get_soup`, then its text is returned with a
    newline separator and stripping enabled. This is typically meant to
    produce the content to index in the ZIM.
    """
    soup = get_soup(content)
    return soup.getText(separator="\n", strip=True)
41 changes: 41 additions & 0 deletions scraper/src/mindtouch2zim/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
logger,
web_session,
)
from mindtouch2zim.html import get_text
from mindtouch2zim.ui import (
ConfigModel,
PageContentModel,
Expand Down Expand Up @@ -539,6 +540,13 @@ def _process_page(
by_alias=True
),
)
self._add_indexing_item_to_zim(
creator=creator,
title=page.title,
content=get_text(rewriten.content),
fname=f"page_{page.id}",
zimui_redirect=page.path,
)

def _report_progress(self):
"""report progress to stats file"""
Expand Down Expand Up @@ -604,6 +612,39 @@ def _fetch_favicon_from_illustration(self, illustration: BytesIO) -> BytesIO:
)
return favicon

def _add_indexing_item_to_zim(
    self,
    creator: Creator,
    title: str,
    content: str,
    fname: str,
    zimui_redirect: str,
):
    """Add a 'fake' item to the ZIM, with proper indexing data

    This is mandatory for suggestions and fulltext search to work properly, since
    we do not really have pages to search for.

    This item is a very basic HTML document which automatically redirects to the
    proper Vue.JS URL.

    Args:
        creator: ZIM creator the item is added to.
        title: title of the item; used in the HTML <title> and as indexed title.
        content: textual content passed as indexed content.
        fname: name of the item, which is stored at path "index/<fname>".
        zimui_redirect: path appended to "../index.html#/" to build the
            redirect target.
    """
    # Local import so this block does not depend on a file-level import change.
    from html import escape

    redirect_url = f"../index.html#/{zimui_redirect}"

    # Title and URL come from scraped pages: escape them so that characters
    # such as "<", "&" or quotes cannot break (or inject markup into) the
    # generated document. Browsers decode the entities back to original values.
    safe_title = escape(title, quote=False)
    safe_redirect_url = escape(redirect_url, quote=True)
    # NOTE(review): a literal "'" in the path would still terminate the quoted
    # URL inside the refresh value once entities are decoded; percent-encoding
    # might be needed for such paths -- confirm against the Vue.JS router.
    html_content = (
        f"<html><head><title>{safe_title}</title>"
        f'<meta http-equiv="refresh" content="0;URL=\'{safe_redirect_url}\'" />'
        f"</head><body></body></html>"
    )

    logger.debug(f"Adding {fname} to ZIM index")
    add_item_for(
        creator=creator,
        title=title,
        path="index/" + fname,
        content=bytes(html_content, "utf-8"),
        mimetype="text/html",
        # index data uses the raw (unescaped) title and text
        index_data=IndexData(title=title, content=content),
    )


# remove all standard rules, they are not adapted to Vue.JS UI
html_rules.rewrite_attribute_rules.clear()
Expand Down
4 changes: 2 additions & 2 deletions scraper/tests/test_client.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import pytest

from mindtouch2zim.client import (
_get_soup, # pyright: ignore[reportPrivateUsage]
_get_welcome_text_from_home, # pyright: ignore[reportPrivateUsage]
)
from mindtouch2zim.html import get_soup


@pytest.mark.parametrize(
Expand Down Expand Up @@ -54,4 +54,4 @@
],
)
def test_get_welcome_text_from_home(content: str, expected: str):
assert _get_welcome_text_from_home(_get_soup(content)) == expected
assert _get_welcome_text_from_home(get_soup(content)) == expected

0 comments on commit 607b60f

Please sign in to comment.