Skip to content

Commit

Permalink
Index pages for suggestions and full-text search
Browse files Browse the repository at this point in the history
  • Loading branch information
benoit74 committed Oct 29, 2024
1 parent 44483b1 commit 0b33ccb
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 12 deletions.
13 changes: 3 additions & 10 deletions scraper/src/mindtouch2zim/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
logger,
web_session,
)
from mindtouch2zim.html import get_soup


class MindtouchParsingError(Exception):
Expand Down Expand Up @@ -173,7 +174,7 @@ def get_home(self) -> MindtouchHome:
"""Retrieves data about home page by crawling home page"""
home_content = self._get_text("/")

soup = _get_soup(home_content)
soup = get_soup(home_content)

Check warning on line 177 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L177

Added line #L177 was not covered by tests
self.deki_token = _get_deki_token_from_home(soup)
return MindtouchHome(
welcome_text_paragraphs=_get_welcome_text_from_home(soup),
Expand All @@ -192,7 +193,7 @@ def get_deki_token(self) -> str:

home_content = self._get_text("/")

soup = _get_soup(home_content)
soup = get_soup(home_content)

Check warning on line 196 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L196

Added line #L196 was not covered by tests
self.deki_token = _get_deki_token_from_home(soup)
return self.deki_token

Expand Down Expand Up @@ -290,14 +291,6 @@ def get_page_content(self, page: LibraryPage) -> LibraryPageContent:
return LibraryPageContent(html_body=tree["body"][0])


def _get_soup(content: str) -> BeautifulSoup:
"""Return a BeautifulSoup soup from textual content
This is a utility function to ensure same parser is used in the whole codebase
"""
return BeautifulSoup(content, "lxml")


def _get_welcome_image_url_from_home(soup: BeautifulSoup) -> str:
"""Return the URL of the image found on home header"""
branding_div = soup.find("div", class_="LTBranding")
Expand Down
17 changes: 17 additions & 0 deletions scraper/src/mindtouch2zim/html.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from bs4 import BeautifulSoup


def get_soup(content: str) -> BeautifulSoup:
    """Parse HTML text into a BeautifulSoup tree.

    Every caller is expected to go through this helper so that one single
    parser ("lxml") is used in the whole codebase.
    """
    parser = "lxml"
    return BeautifulSoup(content, features=parser)


def get_text(content: str) -> str:
    """Extract the textual data of an HTML document.

    Text nodes are stripped and joined with newlines. The result is typically
    meant to be used as the content to index in the ZIM.
    """
    soup = get_soup(content)
    return soup.get_text(separator="\n", strip=True)

Check warning on line 17 in scraper/src/mindtouch2zim/html.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/html.py#L17

Added line #L17 was not covered by tests
41 changes: 41 additions & 0 deletions scraper/src/mindtouch2zim/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
logger,
web_session,
)
from mindtouch2zim.html import get_text
from mindtouch2zim.ui import (
ConfigModel,
PageContentModel,
Expand Down Expand Up @@ -539,6 +540,13 @@ def _process_page(
by_alias=True
),
)
self._add_indexing_item_to_zim(

Check warning on line 543 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L543

Added line #L543 was not covered by tests
creator=creator,
title=page.title,
content=get_text(rewriten.content),
fname=f"page_{page.id}",
zimui_redirect=page.path,
)

def _report_progress(self):
"""report progress to stats file"""
Expand Down Expand Up @@ -604,6 +612,39 @@ def _fetch_favicon_from_illustration(self, illustration: BytesIO) -> BytesIO:
)
return favicon

def _add_indexing_item_to_zim(
    self,
    creator: Creator,
    title: str,
    content: str,
    fname: str,
    zimui_redirect: str,
):
    """Add a 'fake' item to the ZIM, carrying proper indexing data.

    This is mandatory for suggestions and full-text search to work properly,
    since we do not really have pages to search for.

    The item is a very basic HTML document which automatically redirects to
    the proper Vue.JS URL.

    Args:
        creator: ZIM creator the item is added to.
        title: title of the item; used for the HTML <title>, the ZIM entry
            title and the index title.
        content: text to index for this item.
        fname: name of the item; it is stored at path "index/<fname>".
        zimui_redirect: Vue.JS route, appended to "../index.html#/", the item
            redirects to.
    """
    # Imported locally so the block is self-contained; this is the standard
    # library module, not the project's own `mindtouch2zim.html`.
    from html import escape

    # Title and redirect target come from scraped data: escape them so that
    # characters such as <, >, & or quotes cannot break (or inject into) the
    # generated markup. Values without special characters are left unchanged.
    safe_title = escape(title)
    redirect_url = escape(f"../index.html#/{zimui_redirect}", quote=True)
    html_content = (
        f"<html><head><title>{safe_title}</title>"
        f'<meta http-equiv="refresh" content="0;URL=\'{redirect_url}\'" />'
        "</head><body></body></html>"
    )

    logger.debug(f"Adding {fname} to ZIM index")
    add_item_for(
        creator=creator,
        # the raw title is kept for the ZIM entry and the index
        title=title,
        path="index/" + fname,
        content=html_content.encode("utf-8"),
        mimetype="text/html",
        index_data=IndexData(title=title, content=content),
    )


# remove all standard rules, they are not adapted to Vue.JS UI
html_rules.rewrite_attribute_rules.clear()
Expand Down
4 changes: 2 additions & 2 deletions scraper/tests/test_client.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import pytest

from mindtouch2zim.client import (
_get_soup, # pyright: ignore[reportPrivateUsage]
_get_welcome_text_from_home, # pyright: ignore[reportPrivateUsage]
)
from mindtouch2zim.html import get_soup


@pytest.mark.parametrize(
Expand Down Expand Up @@ -54,4 +54,4 @@
],
)
def test_get_welcome_text_from_home(content: str, expected: str):
assert _get_welcome_text_from_home(_get_soup(content)) == expected
assert _get_welcome_text_from_home(get_soup(content)) == expected

0 comments on commit 0b33ccb

Please sign in to comment.