Skip to content

Commit

Permalink
Add missing tests of HTML rewriting
Browse files Browse the repository at this point in the history
  • Loading branch information
benoit74 committed Nov 5, 2024
1 parent 0f43b02 commit 3a7a734
Show file tree
Hide file tree
Showing 3 changed files with 149 additions and 4 deletions.
6 changes: 6 additions & 0 deletions scraper/src/mindtouch2zim/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,12 @@ class UnsupportedTagError(Exception):
pass


class UnsupportedHrefSrcError(Exception):
    """Raised when an href or src value is met where none is expected/supported"""


class NoIllustrationFoundError(Exception):
"""An exception raised when no suitable illustration has been found"""

Expand Down
6 changes: 3 additions & 3 deletions scraper/src/mindtouch2zim/html_rewriting.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

from mindtouch2zim.client import LibraryPage
from mindtouch2zim.constants import logger
from mindtouch2zim.errors import UnsupportedTagError
from mindtouch2zim.errors import UnsupportedHrefSrcError, UnsupportedTagError
from mindtouch2zim.utils import is_better_srcset_descriptor
from mindtouch2zim.vimeo import get_vimeo_thumbnail_url

Expand Down Expand Up @@ -52,8 +52,8 @@ def rewrite_href_src_attributes(
)
if not new_attr_value:
# we do not (yet) support other tags / attributes so we fail the scraper
raise ValueError(
f"Empty new value when rewriting {attr_value} from {attr_name} in {tag} tag"
raise UnsupportedHrefSrcError(
f"Unsupported {attr_name} encountered in {tag} tag (value: {attr_value})"
)
return (attr_name, new_attr_value)

Expand Down
141 changes: 140 additions & 1 deletion scraper/tests/test_html_rewriting.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
)

from mindtouch2zim.client import LibraryPage
from mindtouch2zim.errors import UnsupportedHrefSrcError, UnsupportedTagError
from mindtouch2zim.html_rewriting import HtmlUrlsRewriter


Expand All @@ -14,7 +15,9 @@ def url_rewriter() -> HtmlUrlsRewriter:
return HtmlUrlsRewriter(
library_url="https://www.acme.com",
page=LibraryPage(id="123", title="a page", path="A_Page"),
existing_zim_paths=set(),
existing_zim_paths={
ZimPath("www.acme.com/existing.html"),
},
)


Expand Down Expand Up @@ -97,3 +100,139 @@ def test_html_img_rewriting(
assert rewritten.content == expected_html
assert rewritten.title == ""
assert url_rewriter.items_to_download == expected_items_to_download


@pytest.mark.parametrize(
    ("source_html", "expected_html", "expected_items_to_download"),
    [
        # YouTube embed: expected output is a link wrapping a thumbnail image,
        # followed by a hidden, emptied iframe; the thumbnail is queued for download
        pytest.param(
            '<iframe src="https://www.youtube.com/embed/sQaEthBmZB0?vq=hd1080" '
            'frameborder="0" allowfullscreen="true" '
            'style="width: 100%; height: 100%; position: absolute"></iframe>',
            '<a href="https://www.youtube.com/embed/sQaEthBmZB0?vq=hd1080" '
            'target="_blank">'
            '<div class="zim-removed-video">'
            '<img src="content/i.ytimg.com.fuzzy.replayweb.page/'
            'vi/sQaEthBmZB0/thumbnail.jpg"></img>'
            "</div>"
            "</a>"
            '<iframe style="display: none;"></iframe>',
            {
                ZimPath(
                    "i.ytimg.com.fuzzy.replayweb.page/vi/sQaEthBmZB0/thumbnail.jpg"
                ): {HttpUrl("https://i.ytimg.com/vi/sQaEthBmZB0/hqdefault.jpg")}
            },
            id="youtube",
        ),
        # Vimeo embed: same link + thumbnail structure, thumbnail hosted on
        # i.vimeocdn.com and queued for download
        pytest.param(
            '<iframe src="https://player.vimeo.com/video/153300296" '
            'frameborder="0" allowfullscreen="true" '
            'style="width: 100%; height: 100%; position: absolute"></iframe>',
            '<a href="https://player.vimeo.com/video/153300296" '
            'target="_blank">'
            '<div class="zim-removed-video">'
            '<img src="content/i.vimeocdn.com/video/553546340-'
            '35aa6d23b04e9bdaf254c3cfc4da56bcfd7ff3f75a517c485536082edbf547dd-d_640">'
            "</img>"
            "</div>"
            "</a>"
            '<iframe style="display: none;"></iframe>',
            {
                ZimPath(
                    "i.vimeocdn.com/video/553546340-"
                    "35aa6d23b04e9bdaf254c3cfc4da56bcfd7ff3f75a517c485536082edbf547dd-"
                    "d_640"
                ): {
                    HttpUrl(
                        "https://i.vimeocdn.com/video/553546340-"
                        "35aa6d23b04e9bdaf254c3cfc4da56bcfd7ff3f75a517c485536082e"
                        "dbf547dd-d_640"
                    )
                }
            },
            id="vimeo",
        ),
        # Any other iframe source: expected output is a plain "view online" notice
        # linking to the original URL, and nothing is queued for download
        pytest.param(
            '<iframe src="https://www.acme.com/embed/sQaEthBmZB0?vq=hd1080" '
            'frameborder="0" allowfullscreen="true" '
            'style="width: 100%; height: 100%; position: absolute"></iframe>',
            "This content is not inside the ZIM. View content online at "
            '<a href="https://www.acme.com/embed/sQaEthBmZB0?vq=hd1080" '
            'target="_blank">'
            "<div>https://www.acme.com/embed/sQaEthBmZB0?vq=hd1080</div>"
            "</a>"
            '<iframe style="display: none;"></iframe>',
            {},
            id="unhandled",
        ),
    ],
)
def test_html_iframe_rewriting(
    url_rewriter: HtmlUrlsRewriter,
    html_rewriter: HtmlRewriter,
    source_html: str,
    expected_html: str,
    expected_items_to_download: dict[ZimPath, set[HttpUrl]],
):
    """Check rewritten iframe markup and the items queued for download"""
    result = html_rewriter.rewrite(source_html)
    assert result.content == expected_html
    assert result.title == ""
    assert url_rewriter.items_to_download == expected_items_to_download


def test_html_picture_rewriting(html_rewriter: HtmlRewriter):
    """Rewriting a picture tag is expected to raise UnsupportedTagError"""
    unsupported_html = "<picture>"
    with pytest.raises(UnsupportedTagError):
        html_rewriter.rewrite(unsupported_html)


def test_html_script_rewriting(html_rewriter: HtmlRewriter):
    """Rewriting a script tag with a src is expected to raise UnsupportedHrefSrcError"""
    unsupported_html = "<script src='script.js'>"
    with pytest.raises(UnsupportedHrefSrcError):
        html_rewriter.rewrite(unsupported_html)


@pytest.mark.parametrize(
    ("source_html", "expected_html", "expected_items_to_download"),
    [
        # Absolute URL of the existing page: expected to become an in-ZIM "#/" link
        pytest.param(
            '<a href="https://www.acme.com/existing.html">Page</a>',
            '<a href="#/existing.html">Page</a>',
            {},
            id="internal_absolute",
        ),
        # Root-relative link to the same existing page
        pytest.param(
            '<a href="/existing.html">Page 1</a>',
            '<a href="#/existing.html">Page 1</a>',
            {},
            id="internal_root",
        ),
        # Parent-relative link to the same existing page
        pytest.param(
            '<a href="../existing.html">Page 1</a>',
            '<a href="#/existing.html">Page 1</a>',
            {},
            id="internal_relative",
        ),
        # Parent-relative link to another page: expected to become an absolute
        # online URL on the library host
        pytest.param(
            '<a href="../outside.html">Page 1</a>',
            '<a href="https://www.acme.com/outside.html">Page 1</a>',
            {},
            id="external_relative",
        ),
        # Link to another host: expected to be left untouched
        pytest.param(
            '<a href="https://www.foo.bar/index.html">Page 2</a>',
            '<a href="https://www.foo.bar/index.html">Page 2</a>',
            {},
            id="external",
        ),
    ],
)
def test_html_href_rewriting(
    url_rewriter: HtmlUrlsRewriter,
    html_rewriter: HtmlRewriter,
    source_html: str,
    expected_html: str,
    expected_items_to_download: dict[ZimPath, set[HttpUrl]],
):
    """Check rewritten anchor href values; no case expects any download"""
    result = html_rewriter.rewrite(source_html)
    assert result.content == expected_html
    assert result.title == ""
    assert url_rewriter.items_to_download == expected_items_to_download

0 comments on commit 3a7a734

Please sign in to comment.