Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Index pages for suggestions and full-text search #50

Merged
merged 1 commit into from
Oct 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 3 additions & 10 deletions scraper/src/mindtouch2zim/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
logger,
web_session,
)
from mindtouch2zim.html import get_soup


class MindtouchParsingError(Exception):
Expand Down Expand Up @@ -173,7 +174,7 @@
"""Retrieves data about home page by crawling home page"""
home_content = self._get_text("/")

soup = _get_soup(home_content)
soup = get_soup(home_content)

Check warning on line 177 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L177

Added line #L177 was not covered by tests
self.deki_token = _get_deki_token_from_home(soup)
return MindtouchHome(
welcome_text_paragraphs=_get_welcome_text_from_home(soup),
Expand All @@ -192,7 +193,7 @@

home_content = self._get_text("/")

soup = _get_soup(home_content)
soup = get_soup(home_content)

Check warning on line 196 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L196

Added line #L196 was not covered by tests
self.deki_token = _get_deki_token_from_home(soup)
return self.deki_token

Expand Down Expand Up @@ -290,14 +291,6 @@
return LibraryPageContent(html_body=tree["body"][0])


def _get_soup(content: str) -> BeautifulSoup:
    """Parse textual content into a BeautifulSoup tree using the "lxml" parser.

    Going through this single helper keeps the parser choice identical
    everywhere in the codebase.
    """
    return BeautifulSoup(content, features="lxml")


def _get_welcome_image_url_from_home(soup: BeautifulSoup) -> str:
"""Return the URL of the image found on home header"""
branding_div = soup.find("div", class_="LTBranding")
Expand Down
17 changes: 17 additions & 0 deletions scraper/src/mindtouch2zim/html.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from bs4 import BeautifulSoup


def get_soup(content: str) -> BeautifulSoup:
    """Parse HTML content into a BeautifulSoup tree using the "lxml" parser.

    Going through this single helper keeps the parser choice identical
    everywhere in the codebase.
    """
    return BeautifulSoup(content, features="lxml")


def get_text(content: str) -> str:
    """Return the text data found in HTML content.

    The markup is parsed with get_soup (so the same parser is used as in the
    rest of the codebase); each piece of text is stripped of surrounding
    whitespace, pieces left empty are skipped, and the remainder is joined
    with newlines.

    This is typically meant to extract content to index in the ZIM.
    """
    # `get_text` is the documented Beautiful Soup 4 name; `getText` is only a
    # legacy camelCase alias kept for BS3 compatibility.
    return get_soup(content).get_text("\n", strip=True)

Check warning on line 17 in scraper/src/mindtouch2zim/html.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/html.py#L17

Added line #L17 was not covered by tests
41 changes: 41 additions & 0 deletions scraper/src/mindtouch2zim/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
logger,
web_session,
)
from mindtouch2zim.html import get_text
from mindtouch2zim.ui import (
ConfigModel,
PageContentModel,
Expand Down Expand Up @@ -539,6 +540,13 @@
by_alias=True
),
)
self._add_indexing_item_to_zim(

Check warning on line 543 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L543

Added line #L543 was not covered by tests
creator=creator,
title=page.title,
content=get_text(rewriten.content),
fname=f"page_{page.id}",
zimui_redirect=page.path,
)

def _report_progress(self):
"""report progress to stats file"""
Expand Down Expand Up @@ -604,6 +612,39 @@
)
return favicon

def _add_indexing_item_to_zim(
    self,
    creator: Creator,
    title: str,
    content: str,
    fname: str,
    zimui_redirect: str,
):
    """Add a 'fake' item to the ZIM, with proper indexing data

    This is mandatory for suggestions and fulltext search to work properly, since
    we do not really have pages to search for.

    This item is a very basic HTML page which automatically redirects to the
    proper Vue.JS URL.

    Parameters:
        creator: ZIM creator the item is added to
        title: title of the item, also used as title in the indexing data
        content: textual content stored in the indexing data
        fname: name of the item, stored in the ZIM under "index/<fname>"
        zimui_redirect: path appended to "../index.html#/" to build the URL
            the stub page redirects to
    """
    # Function-scope import of the standard library `html` module (not the
    # project's own `mindtouch2zim.html`), so this block stays self-contained.
    from html import escape

    redirect_url = f"../index.html#/{zimui_redirect}"

    # Both values are interpolated into markup: escape them so that a title
    # or path containing `<`, `&` or `"` cannot break the generated page.
    # NOTE(review): an apostrophe in the path would still terminate the
    # single-quoted URL once the browser has decoded the attribute value;
    # confirm paths are percent-encoded upstream.
    html_content = (
        f"<html><head><title>{escape(title)}</title>"
        f'<meta http-equiv="refresh" content="0;URL=\'{escape(redirect_url)}\'" />'
        f"</head><body></body></html>"
    )

    logger.debug(f"Adding {fname} to ZIM index")
    add_item_for(
        creator=creator,
        title=title,
        path="index/" + fname,
        content=html_content.encode("utf-8"),
        mimetype="text/html",
        # indexing data is plain text, not markup: keep it unescaped
        index_data=IndexData(title=title, content=content),
    )


# remove all standard rules, they are not adapted to Vue.JS UI
html_rules.rewrite_attribute_rules.clear()
Expand Down
4 changes: 2 additions & 2 deletions scraper/tests/test_client.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import pytest

from mindtouch2zim.client import (
_get_soup, # pyright: ignore[reportPrivateUsage]
_get_welcome_text_from_home, # pyright: ignore[reportPrivateUsage]
)
from mindtouch2zim.html import get_soup


@pytest.mark.parametrize(
Expand Down Expand Up @@ -54,4 +54,4 @@
],
)
def test_get_welcome_text_from_home(content: str, expected: str):
assert _get_welcome_text_from_home(_get_soup(content)) == expected
assert _get_welcome_text_from_home(get_soup(content)) == expected