Skip to content

Commit

Permalink
Merge pull request #49 from openzim/retry_web_calls
Browse files (browse the repository at this point in the history)
Use zimscraperlib session to fetch web content with automatic meaningful retries
  • Loading branch information
benoit74 authored Oct 29, 2024
2 parents a2b3708 + 9c69ae6 commit bc02d11
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 14 deletions.
11 changes: 5 additions & 6 deletions scraper/src/mindtouch2zim/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,15 @@
from pathlib import Path
from typing import Any

import requests
from bs4 import BeautifulSoup, NavigableString
from pydantic import BaseModel
from requests import Response

from mindtouch2zim.constants import (
HTTP_TIMEOUT_LONG_SECONDS,
HTTP_TIMEOUT_NORMAL_SECONDS,
logger,
web_session,
)


Expand Down Expand Up @@ -121,7 +122,7 @@ def _get_text(self, url_subpath_and_query: str) -> str:
full_url = f"{self.library_url}{url_subpath_and_query}"
logger.debug(f"Fetching {full_url}")

resp = requests.get(
resp = web_session.get(
url=full_url,
allow_redirects=True,
timeout=HTTP_TIMEOUT_NORMAL_SECONDS,
Expand All @@ -131,12 +132,10 @@ def _get_text(self, url_subpath_and_query: str) -> str:
cache_file.write_text(resp.text)
return resp.text

def _get_api_resp(
self, api_sub_path_and_query: str, timeout: float
) -> requests.Response:
def _get_api_resp(self, api_sub_path_and_query: str, timeout: float) -> Response:
api_url = f"{self.api_url}{api_sub_path_and_query}"
logger.debug(f"Calling API at {api_url}")
resp = requests.get(
resp = web_session.get(
url=api_url,
headers={"x-deki-token": self.deki_token},
timeout=timeout,
Expand Down
3 changes: 3 additions & 0 deletions scraper/src/mindtouch2zim/constants.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import logging
import pathlib

from zimscraperlib.download import get_session
from zimscraperlib.logging import (
getLogger,
)
Expand All @@ -18,3 +19,5 @@
HTTP_TIMEOUT_LONG_SECONDS = 30

logger = getLogger(NAME, level=logging.DEBUG)

web_session = get_session()
24 changes: 19 additions & 5 deletions scraper/src/mindtouch2zim/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,13 @@
MindtouchClient,
MindtouchHome,
)
from mindtouch2zim.constants import LANGUAGE_ISO_639_3, NAME, VERSION, logger
from mindtouch2zim.constants import (
LANGUAGE_ISO_639_3,
NAME,
VERSION,
logger,
web_session,
)
from mindtouch2zim.ui import (
ConfigModel,
PageContentModel,
Expand Down Expand Up @@ -345,7 +351,9 @@ def run(self) -> Path:
)

welcome_image = BytesIO()
stream_file(home.welcome_image_url, byte_stream=welcome_image)
stream_file(
home.welcome_image_url, byte_stream=welcome_image, session=web_session
)
add_item_for(creator, "content/logo.png", content=welcome_image.getvalue())
del welcome_image

Expand Down Expand Up @@ -437,7 +445,11 @@ def run(self) -> Path:
for asset_url in asset_urls:
try:
asset_content = BytesIO()
stream_file(asset_url.value, byte_stream=asset_content)
stream_file(
asset_url.value,
byte_stream=asset_content,
session=web_session,
)
logger.debug(
f"Adding {asset_url.value} to {asset_path.value} in the ZIM"
)
Expand Down Expand Up @@ -474,7 +486,7 @@ def _process_css(
raise ValueError(f"Cannot process empty css_location for {target_filename}")
if not css_content:
css_buffer = BytesIO()
stream_file(css_location, byte_stream=css_buffer)
stream_file(css_location, byte_stream=css_buffer, session=web_session)
css_content = css_buffer.getvalue()
url_rewriter = CssUrlsRewriter(
article_url=HttpUrl(css_location),
Expand Down Expand Up @@ -548,7 +560,9 @@ def _fetch_zim_illustration(self, home: MindtouchHome) -> BytesIO:
try:
logger.debug(f"Downloading {icon_url} illustration")
illustration_content = BytesIO()
stream_file(icon_url, byte_stream=illustration_content)
stream_file(
icon_url, byte_stream=illustration_content, session=web_session
)
illustration_format = format_for(
illustration_content, from_suffix=False
)
Expand Down
5 changes: 2 additions & 3 deletions scraper/src/mindtouch2zim/vimeo.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import requests

from mindtouch2zim.constants import (
HTTP_TIMEOUT_NORMAL_SECONDS,
logger,
web_session,
)


Expand All @@ -14,7 +13,7 @@ class VimeoThumbnailError(Exception):

def get_vimeo_thumbnail_url(video_url: str) -> str:
"""From a vimeo URL - player or normal - retrieve corresponding thumbnail URL"""
resp = requests.get(
resp = web_session.get(
f"https://vimeo.com/api/oembed.json?url={video_url}",
timeout=HTTP_TIMEOUT_NORMAL_SECONDS,
)
Expand Down

0 comments on commit bc02d11

Please sign in to comment.