Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use zimscraperlib session to fetch web content with automatic meaningful retries #49

Merged
Merged 1 commit on Oct 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 5 additions & 6 deletions scraper/src/mindtouch2zim/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,15 @@
from pathlib import Path
from typing import Any

import requests
from bs4 import BeautifulSoup, NavigableString
from pydantic import BaseModel
from requests import Response

from mindtouch2zim.constants import (
HTTP_TIMEOUT_LONG_SECONDS,
HTTP_TIMEOUT_NORMAL_SECONDS,
logger,
web_session,
)


Expand Down Expand Up @@ -121,7 +122,7 @@
full_url = f"{self.library_url}{url_subpath_and_query}"
logger.debug(f"Fetching {full_url}")

resp = requests.get(
resp = web_session.get(

Check warning on line 125 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L125

Added line #L125 was not covered by tests
url=full_url,
allow_redirects=True,
timeout=HTTP_TIMEOUT_NORMAL_SECONDS,
Expand All @@ -131,12 +132,10 @@
cache_file.write_text(resp.text)
return resp.text

def _get_api_resp(
self, api_sub_path_and_query: str, timeout: float
) -> requests.Response:
def _get_api_resp(self, api_sub_path_and_query: str, timeout: float) -> Response:
api_url = f"{self.api_url}{api_sub_path_and_query}"
logger.debug(f"Calling API at {api_url}")
resp = requests.get(
resp = web_session.get(

Check warning on line 138 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L138

Added line #L138 was not covered by tests
url=api_url,
headers={"x-deki-token": self.deki_token},
timeout=timeout,
Expand Down
3 changes: 3 additions & 0 deletions scraper/src/mindtouch2zim/constants.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import logging
import pathlib

from zimscraperlib.download import get_session
from zimscraperlib.logging import (
getLogger,
)
Expand All @@ -18,3 +19,5 @@
HTTP_TIMEOUT_LONG_SECONDS = 30

logger = getLogger(NAME, level=logging.DEBUG)

web_session = get_session()
24 changes: 19 additions & 5 deletions scraper/src/mindtouch2zim/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,13 @@
MindtouchClient,
MindtouchHome,
)
from mindtouch2zim.constants import LANGUAGE_ISO_639_3, NAME, VERSION, logger
from mindtouch2zim.constants import (
LANGUAGE_ISO_639_3,
NAME,
VERSION,
logger,
web_session,
)
from mindtouch2zim.ui import (
ConfigModel,
PageContentModel,
Expand Down Expand Up @@ -345,7 +351,9 @@
)

welcome_image = BytesIO()
stream_file(home.welcome_image_url, byte_stream=welcome_image)
stream_file(

Check warning on line 354 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L354

Added line #L354 was not covered by tests
home.welcome_image_url, byte_stream=welcome_image, session=web_session
)
add_item_for(creator, "content/logo.png", content=welcome_image.getvalue())
del welcome_image

Expand Down Expand Up @@ -437,7 +445,11 @@
for asset_url in asset_urls:
try:
asset_content = BytesIO()
stream_file(asset_url.value, byte_stream=asset_content)
stream_file(

Check warning on line 448 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L448

Added line #L448 was not covered by tests
asset_url.value,
byte_stream=asset_content,
session=web_session,
)
logger.debug(
f"Adding {asset_url.value} to {asset_path.value} in the ZIM"
)
Expand Down Expand Up @@ -474,7 +486,7 @@
raise ValueError(f"Cannot process empty css_location for {target_filename}")
if not css_content:
css_buffer = BytesIO()
stream_file(css_location, byte_stream=css_buffer)
stream_file(css_location, byte_stream=css_buffer, session=web_session)

Check warning on line 489 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L489

Added line #L489 was not covered by tests
css_content = css_buffer.getvalue()
url_rewriter = CssUrlsRewriter(
article_url=HttpUrl(css_location),
Expand Down Expand Up @@ -548,7 +560,9 @@
try:
logger.debug(f"Downloading {icon_url} illustration")
illustration_content = BytesIO()
stream_file(icon_url, byte_stream=illustration_content)
stream_file(

Check warning on line 563 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L563

Added line #L563 was not covered by tests
icon_url, byte_stream=illustration_content, session=web_session
)
illustration_format = format_for(
illustration_content, from_suffix=False
)
Expand Down
5 changes: 2 additions & 3 deletions scraper/src/mindtouch2zim/vimeo.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import requests

from mindtouch2zim.constants import (
HTTP_TIMEOUT_NORMAL_SECONDS,
logger,
web_session,
)


Expand All @@ -14,7 +13,7 @@

def get_vimeo_thumbnail_url(video_url: str) -> str:
"""From a vimeo URL - player or normal - retrieve corresponding thumbnail URL"""
resp = requests.get(
resp = web_session.get(

Check warning on line 16 in scraper/src/mindtouch2zim/vimeo.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/vimeo.py#L16

Added line #L16 was not covered by tests
f"https://vimeo.com/api/oembed.json?url={video_url}",
timeout=HTTP_TIMEOUT_NORMAL_SECONDS,
)
Expand Down