Skip to content

Commit

Permalink
Merge pull request #43 from openzim/handle_videos_iframes
Browse files Browse the repository at this point in the history
Handle videos iframes and add visual indicator of external links
  • Loading branch information
benoit74 authored Oct 29, 2024
2 parents a732a8a + 52559e6 commit 118dab7
Show file tree
Hide file tree
Showing 9 changed files with 193 additions and 18 deletions.
2 changes: 1 addition & 1 deletion scraper/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ dependencies = [
"yt-dlp", # youtube-dl should be updated as frequently as possible
"jinja2==3.1.4",
# use zimscraperlib pinned version once content rewriting functions have been released
"zimscraperlib @ git+https://github.com/openzim/python-scraperlib@mindtouch_changes",
"zimscraperlib @ git+https://github.com/openzim/python-scraperlib@main",
"requests==2.32.3",
"types-requests==2.32.0.20240914",
"kiwixstorage==0.9.0",
Expand Down
9 changes: 5 additions & 4 deletions scraper/src/mindtouch2zim/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,11 @@
from bs4 import BeautifulSoup, NavigableString
from pydantic import BaseModel

from mindtouch2zim.constants import logger

HTTP_TIMEOUT_NORMAL_SECONDS = 15
HTTP_TIMEOUT_LONG_SECONDS = 30
from mindtouch2zim.constants import (
HTTP_TIMEOUT_LONG_SECONDS,
HTTP_TIMEOUT_NORMAL_SECONDS,
logger,
)


class MindtouchParsingError(Exception):
Expand Down
3 changes: 3 additions & 0 deletions scraper/src/mindtouch2zim/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,7 @@
# As of 2024-09-24, all libraries appear to be in English.
LANGUAGE_ISO_639_3 = "eng"

HTTP_TIMEOUT_NORMAL_SECONDS = 15
HTTP_TIMEOUT_LONG_SECONDS = 30

logger = getLogger(NAME, level=logging.DEBUG)
96 changes: 83 additions & 13 deletions scraper/src/mindtouch2zim/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from zimscraperlib.image.conversion import convert_svg2png
from zimscraperlib.image.probing import format_for
from zimscraperlib.rewriting.css import CssRewriter
from zimscraperlib.rewriting.html import HtmlRewriter
from zimscraperlib.rewriting.html import AttrsList, HtmlRewriter, get_attr_value_from
from zimscraperlib.rewriting.html import rules as html_rules
from zimscraperlib.rewriting.url_rewriting import (
ArticleUrlRewriter,
Expand All @@ -41,6 +41,7 @@
PageModel,
SharedModel,
)
from mindtouch2zim.vimeo import get_vimeo_thumbnail_url
from mindtouch2zim.zimconfig import ZimConfig


Expand Down Expand Up @@ -582,7 +583,7 @@ def rewrite_href_src_attributes(
if attr_name not in ("href", "src") or not attr_value:
return
if not isinstance(url_rewriter, HtmlUrlsRewriter):
raise Exception("Expecting MindtouchUrlRewriter")
raise Exception("Expecting HtmlUrlsRewriter")
new_attr_value = None
if tag == "a":
rewrite_result = url_rewriter(
Expand All @@ -601,17 +602,7 @@ def rewrite_href_src_attributes(
)
# add 'content/' to the URL since all assets will be stored in the sub.-path
new_attr_value = f"content/{rewrite_result.rewriten_url}"
if rewrite_result.zim_path is not None:
# if item is expected to be inside the ZIM, store asset information so that
# we can download it afterwards
if rewrite_result.zim_path in url_rewriter.items_to_download:
url_rewriter.items_to_download[rewrite_result.zim_path].add(
HttpUrl(rewrite_result.absolute_url)
)
else:
url_rewriter.items_to_download[rewrite_result.zim_path] = {
HttpUrl(rewrite_result.absolute_url)
}
url_rewriter.add_item_to_download(rewrite_result)
if not new_attr_value:
# we do not (yet) support other tags / attributes so we fail the scraper
raise ValueError(
Expand All @@ -634,6 +625,71 @@ def refuse_unsupported_tags(tag: str):
raise UnsupportedTagError(f"Tag {tag} is not yet supported in this scraper")


# Matches an iframe src pointing to a YouTube embed: any prefix, then "youtube"
# optionally followed by "-<word>" segments, a ".<word>" suffix, "/embed/" and the
# video identifier (captured in the "id" group), with an optional query string.
YOUTUBE_IFRAME_RE = re.compile(r".*youtube(?:-\w+)*\.\w+\/embed\/(?P<id>.*?)(?:\?.*)*$")
# Matches an iframe src pointing to a Vimeo player: any prefix, then "vimeo"
# optionally followed by "-<word>" segments, a ".<word>" suffix, "/video/" and the
# rest of the path, with an optional query string. Nothing is captured.
VIMEO_IFRAME_RE = re.compile(r".*vimeo(?:-\w+)*\.\w+\/video\/(?:.*?)(?:\?.*)*$")


@html_rules.rewrite_tag()
def rewrite_iframe_tags(
    tag: str,
    attrs: AttrsList,
    base_href: str | None,
    url_rewriter: ArticleUrlRewriter,
):
    """Rewrite youtube and vimeo iframes to remove player until video is included

    YouTube and Vimeo iframes are replaced by a link to the online video wrapping
    the video thumbnail, which is registered on ``url_rewriter`` to be downloaded.
    Any other iframe, or an iframe whose thumbnail could not be processed, is
    replaced by a short notice linking to the original ``src``.

    Returns:
        Nothing when ``tag`` is not ``iframe``, otherwise the replacement HTML.

    Raises:
        Exception: when ``url_rewriter`` is not an ``HtmlUrlsRewriter``.
        UnsupportedTagError: when the iframe has a missing or empty ``src``.
    """
    from html import escape

    if tag != "iframe":
        return
    if not isinstance(url_rewriter, HtmlUrlsRewriter):
        raise Exception("Expecting HtmlUrlsRewriter")
    src = get_attr_value_from(attrs=attrs, name="src")
    if not src:
        raise UnsupportedTagError("Unsupported empty src in iframe")

    image_rewriten_url = None
    try:
        thumbnail_url = None
        if ytb_match := YOUTUBE_IFRAME_RE.match(src):
            thumbnail_url = (
                f'https://i.ytimg.com/vi/{ytb_match.group("id")}/hqdefault.jpg'
            )
        elif VIMEO_IFRAME_RE.match(src):
            thumbnail_url = get_vimeo_thumbnail_url(src)
        else:
            logger.debug(f"iframe pointing to {src} will not have any preview")
        if thumbnail_url:
            rewrite_result = url_rewriter(thumbnail_url, base_href=base_href)
            url_rewriter.add_item_to_download(rewrite_result)
            image_rewriten_url = rewrite_result.rewriten_url
    except Exception as exc:
        # deliberate best-effort: a failure to get a preview must not fail the
        # whole page, we fall back to the textual notice below
        logger.warning(f"Failed to rewrite iframe with src {src}", exc_info=exc)

    # src is an attribute value re-injected into hand-built markup: escape it so
    # that quotes, '<' or '&' cannot break out of the attribute / text node
    escaped_src = escape(src, quote=True)

    if image_rewriten_url:
        return (
            f'<a href="{escaped_src}" target="_blank">'
            '<div class="zim-removed-video">'
            # img is a void element, it must not have a closing tag
            f'<img src="content/{escape(image_rewriten_url, quote=True)}">'
            "</div>"
            "</a>"
            '<iframe style="display: none;">'  # fake opening tag just to remove iframe
        )

    # replace iframe with text indicating the online URL which has not been ZIMed
    return (
        "This content is not inside the ZIM. "
        f'View content online at <a href="{escaped_src}" target="_blank">'
        "<div>"
        f"{escaped_src}"
        "</div>"
        "</a>"
        '<iframe style="display: none;">'  # fake opening tag just to remove iframe
    )


class HtmlUrlsRewriter(ArticleUrlRewriter):
"""A rewriter for HTML processing
Expand All @@ -659,6 +715,20 @@ def __call__(
result = super().__call__(item_url, base_href, rewrite_all_url=rewrite_all_url)
return result

def add_item_to_download(self, rewrite_result: RewriteResult):
    """Add item to download based on rewrite result

    Nothing is recorded when the rewrite result has no ``zim_path``. Otherwise
    the absolute URL of the result is added to the set of URLs stored in
    ``self.items_to_download`` for this ZIM path, so that the asset can be
    downloaded afterwards.
    """
    zim_path = rewrite_result.zim_path
    if zim_path is None:
        # item is not expected to be inside the ZIM, nothing to download
        return
    # setdefault creates the set on first sight of this path, then we add to it
    self.items_to_download.setdefault(zim_path, set()).add(
        HttpUrl(rewrite_result.absolute_url)
    )


class CssUrlsRewriter(ArticleUrlRewriter):
"""A rewriter for CSS processing, storing items to download as URL as processed"""
Expand Down
30 changes: 30 additions & 0 deletions scraper/src/mindtouch2zim/vimeo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import requests

from mindtouch2zim.constants import (
HTTP_TIMEOUT_NORMAL_SECONDS,
logger,
)


class VimeoThumbnailError(Exception):
    """Error raised when the thumbnail URL of a vimeo video cannot be found"""


def get_vimeo_thumbnail_url(video_url: str) -> str:
    """From a vimeo URL - player or normal - retrieve corresponding thumbnail URL

    The thumbnail URL is read from the ``thumbnail_url`` key of the JSON document
    returned by the Vimeo oEmbed endpoint for ``video_url``.

    Raises:
        requests.HTTPError: when the endpoint answers with an HTTP error status.
        VimeoThumbnailError: when ``thumbnail_url`` is missing or empty in the
            response.
    """
    resp = requests.get(
        "https://vimeo.com/api/oembed.json",
        # pass the video URL as a parameter so that it is percent-encoded: pasted
        # raw in the query string, any '&' or '#' it contains would be interpreted
        # as part of the oEmbed request instead of part of the video URL
        params={"url": video_url},
        timeout=HTTP_TIMEOUT_NORMAL_SECONDS,
    )
    resp.raise_for_status()
    json_doc = resp.json()
    if "thumbnail_url" not in json_doc:
        logger.warning(f"Failed to find thumbnail_url in response:\n{resp.text}")
        raise VimeoThumbnailError("API response misses the thumbnail_url")
    thumbnail_url = json_doc["thumbnail_url"]
    if not thumbnail_url:
        logger.warning(f"Empty thumbnail_url in response:\n{resp.text}")
        raise VimeoThumbnailError("API response has empty thumbnail_url")
    return thumbnail_url
1 change: 1 addition & 0 deletions zimui/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
<link rel="stylesheet" type="text/css" media="screen" href="./content/screen.css" />
<link rel="stylesheet" type="text/css" media="print" href="./content/print.css" />
<link rel="stylesheet" type="text/css" href="./content/inline.css" />
<link rel="stylesheet" type="text/css" href="./custom.css" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Vite App</title>
</head>
Expand Down
41 changes: 41 additions & 0 deletions zimui/public/custom.css
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
a[href^="http://"]:after,
a[href^="https://"]:after
{
content: '';
display: inline-block;
width: 10px;
height: 10px;
background-image: url('external-link.svg');
background-size: contain;
background-repeat: no-repeat;
margin-left: 5px;
position: relative;
bottom: 0px;
right: 0px;
}

.zim-removed-video {
position: relative;
display: inline-block;
width: 100%;
height: auto;
}

.zim-removed-video::before {
content: '';
position: absolute;
top: 0;
left: 0;
width: 100%;
height: 100%;
background-image: url('play-button.svg');
background-repeat: no-repeat;
background-position: center;
background-size: 30%;
}

.zim-removed-video img {
width: 100%;
height: auto;
display: block;
}
15 changes: 15 additions & 0 deletions zimui/public/external-link.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
14 changes: 14 additions & 0 deletions zimui/public/play-button.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit 118dab7

Please sign in to comment.