Skip to content

Commit

Permalink
Catch all exception on bad asset processing, and log full details abo…
Browse files Browse the repository at this point in the history
…ut asset
  • Loading branch information
benoit74 committed Nov 26, 2024
1 parent e51c9cd commit dfe4246
Show file tree
Hide file tree
Showing 4 changed files with 53 additions and 42 deletions.
57 changes: 28 additions & 29 deletions scraper/src/mindtouch2zim/asset.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,15 @@ class HeaderData(NamedTuple):


class AssetDetails(NamedTuple):
urls: set[HttpUrl]
asset_urls: set[HttpUrl]
used_by: set[str]
always_fetch_online: bool

def used_by_str(self) -> str:
if len(self.used_by) == 0:
return ""
return f' used by {", ".join(self.used_by)}'

Check warning on line 53 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L52-L53

Added lines #L52 - L53 were not covered by tests


class AssetProcessor:

Expand All @@ -62,55 +68,47 @@ def process_asset(
asset_details: AssetDetails,
creator: Creator,
):
logger.debug(f"Processing asset for {asset_path}")
CONTEXT.processing_step = f"processing asset {asset_path}"
self._process_asset_internal(
asset_path=asset_path, asset_details=asset_details, creator=creator
)

def _process_asset_internal(
self,
asset_path: ZimPath,
asset_details: AssetDetails,
creator: Creator,
):
for asset_url in asset_details.urls:
"""Download and add to the ZIM a given asset (image, ...)"""
for asset_url in asset_details.asset_urls:
try:
CONTEXT.processing_step = (

Check warning on line 74 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L74

Added line #L74 was not covered by tests
f"asset from {asset_url.value}{asset_details.used_by_str()}"
)
logger.debug(f"Processing {CONTEXT.processing_step}")

Check warning on line 77 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L77

Added line #L77 was not covered by tests
asset_content = self.get_asset_content(
asset_path=asset_path,
asset_url=asset_url,
always_fetch_online=asset_details.always_fetch_online,
)
logger.debug(
f"Adding {asset_url.value} to {asset_path.value} in the ZIM"
)
logger.debug(f"Adding asset to {asset_path.value} in the ZIM")

Check warning on line 83 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L83

Added line #L83 was not covered by tests
creator.add_item_for(
path="content/" + asset_path.value,
content=asset_content.getvalue(),
)
break # file found and added
except KnownBadAssetFailedError as exc:
logger.debug(f"Ignoring known bad asset for {asset_url.value}: {exc}")
except RequestException as exc:
logger.debug(f"Ignoring known bad asset: {exc}")
except Exception as exc:

Check warning on line 91 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L90-L91

Added lines #L90 - L91 were not covered by tests
# all other exceptions (not only RequestsException) lead to an increase
# of bad_assets_count, because we have no idea what could go wrong here
# and limit and bad assets threshold should be correct in production,
# or ignored at own user risk
with self.lock:
self.bad_assets_count += 1
log_message = (

Check warning on line 98 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L98

Added line #L98 was not covered by tests
f"Exception while processing {CONTEXT.processing_step}: {exc}"
)
if (
CONTEXT.bad_assets_threshold >= 0
and self.bad_assets_count > CONTEXT.bad_assets_threshold
):
logger.error(
f"Exception while processing asset for {asset_url.value}: "
f"{exc}"
)
logger.error(log_message)

Check warning on line 105 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L105

Added line #L105 was not covered by tests
raise Exception( # noqa: B904
f"Asset failure threshold ({CONTEXT.bad_assets_threshold}) "
"reached, stopping execution"
)
else:
logger.warning(
f"Exception while processing asset for {asset_url.value}: "
f"{exc}"
)
logger.warning(log_message)

Check warning on line 111 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L111

Added line #L111 was not covered by tests

def _get_header_data_for(self, url: HttpUrl) -> HeaderData:
"""Get details from headers for a given url
Expand Down Expand Up @@ -244,7 +242,8 @@ def get_asset_content(
)
else:
logger.debug(
f"Not optimizing, unsupported mime type: {mime_type}"
f"Not optimizing, unsupported mime type: {mime_type} for "
f"{CONTEXT.processing_step}"
)

return self._download_from_online(asset_url=asset_url)
Expand All @@ -256,7 +255,7 @@ def get_asset_content(
if CONTEXT.bad_assets_regex and CONTEXT.bad_assets_regex.findall(
asset_url.value
):
raise KnownBadAssetFailedError() from exc
raise KnownBadAssetFailedError(exc) from exc

Check warning on line 258 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L258

Added line #L258 was not covered by tests
raise

def _setup_s3(self):
Expand Down
5 changes: 4 additions & 1 deletion scraper/src/mindtouch2zim/entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,5 +259,8 @@ def main(tmpdir: str) -> None:
raise
except Exception as exc:
logger.exception(exc)
logger.error(f"Generation failed with the following error: {exc}")
logger.error(

Check warning on line 262 in scraper/src/mindtouch2zim/entrypoint.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/entrypoint.py#L262

Added line #L262 was not covered by tests
"Generation failed with the following error while processing "
f"{CONTEXT.processing_step}: {exc}"
)
raise SystemExit(1) from exc
2 changes: 1 addition & 1 deletion scraper/src/mindtouch2zim/html_rewriting.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def rewrite_href_src_srcset_attributes(
new_attr_value = ""
logger.warning(
f"Unsupported '{attr_name}' encountered in '{tag}' tag (value: "
f"'{attr_value}') while {CONTEXT.processing_step}"
f"'{attr_value}') while rewriting {CONTEXT.processing_step}"
)
return (attr_name, new_attr_value)

Expand Down
31 changes: 20 additions & 11 deletions scraper/src/mindtouch2zim/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ def run(self) -> Path:

def run_with_creator(self, creator: Creator):

CONTEXT.processing_step = "Storing standard files"
CONTEXT.processing_step = "standard files"

Check warning on line 258 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L258

Added line #L258 was not covered by tests

logger.info(" Storing configuration...")
creator.add_item_for(
Expand Down Expand Up @@ -339,7 +339,7 @@ def run_with_creator(self, creator: Creator):
)

logger.info("Fetching pages tree")
CONTEXT.processing_step = "Fetching pages tree"
CONTEXT.processing_step = "pages tree"

Check warning on line 342 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L342

Added line #L342 was not covered by tests
pages_tree = self.mindtouch_client.get_page_tree()
selected_pages = self.content_filter.filter(pages_tree)
logger.info(
Expand All @@ -359,7 +359,7 @@ def run_with_creator(self, creator: Creator):
)

logger.info("Fetching pages content")
CONTEXT.processing_step = "Fetching pages content"
CONTEXT.processing_step = "pages content"

Check warning on line 362 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L362

Added line #L362 was not covered by tests
# compute the list of existing pages to properly rewrite links leading
# in-ZIM / out-of-ZIM
self.stats_items_total += len(selected_pages)
Expand Down Expand Up @@ -398,7 +398,7 @@ def run_with_creator(self, creator: Creator):
del private_pages

logger.info(f" Retrieving {len(self.items_to_download)} assets...")
CONTEXT.processing_step = "Processing assets"
CONTEXT.processing_step = "assets"

Check warning on line 401 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L401

Added line #L401 was not covered by tests
self.stats_items_total += len(self.items_to_download)

res = self.asset_executor(
Expand Down Expand Up @@ -427,6 +427,8 @@ def _process_css(
"""Process a given CSS stylesheet
Download content if necessary, rewrite CSS and add CSS to ZIM
"""
CONTEXT.processing_step = f"CSS at {css_location}"
logger.debug(f" Fetching {CONTEXT.processing_step}")

Check warning on line 431 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L430-L431

Added lines #L430 - L431 were not covered by tests
if not css_location:
raise ValueError(f"Cannot process empty css_location for {target_filename}")
if not css_content:
Expand All @@ -447,10 +449,13 @@ def _process_css(
# to use last URL encountered.
for path, urls in url_rewriter.items_to_download.items():
if path in self.items_to_download:
self.items_to_download[path].urls.update(urls)
self.items_to_download[path].asset_urls.update(urls)
self.items_to_download[path].used_by.add(CONTEXT.processing_step)

Check warning on line 453 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L452-L453

Added lines #L452 - L453 were not covered by tests
else:
self.items_to_download[path] = AssetDetails(
urls=urls, always_fetch_online=True
asset_urls=urls,
used_by={CONTEXT.processing_step},
always_fetch_online=True,
)
creator.add_item_for(f"content/{target_filename}", content=result)

Expand All @@ -466,8 +471,8 @@ def _process_page(
"""Process a given library page
Download content, rewrite HTML and add JSON to ZIM
"""
logger.debug(f" Fetching {page.id}")
CONTEXT.processing_step = f"processing page {page.id} at {page.encoded_url}"
CONTEXT.processing_step = f"page ID {page.id} ({page.encoded_url})"
logger.debug(f" Fetching {CONTEXT.processing_step}")

Check warning on line 475 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L474-L475

Added lines #L474 - L475 were not covered by tests
page_content = self.mindtouch_client.get_page_content(page)
url_rewriter = HtmlUrlsRewriter(
CONTEXT.library_url,
Expand Down Expand Up @@ -508,19 +513,23 @@ def _process_page(
# and since these pages are absolutely not essential, we just display a
# warning and store an empty page
logger.warning(
f"Problem processing special page ID {page.id} ({page.encoded_url})"
f"Problem processing special {CONTEXT.processing_step}"
f", page is probably empty, storing empty page: {exc}"
)
return ""
if not rewriten:
# Default rewriting for 'normal' pages
rewriten = rewriter.rewrite(page_content.html_body).content
for path, urls in url_rewriter.items_to_download.items():

if path in self.items_to_download:
self.items_to_download[path].urls.update(urls)
self.items_to_download[path].asset_urls.update(urls)
self.items_to_download[path].used_by.add(CONTEXT.processing_step)

Check warning on line 527 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L526-L527

Added lines #L526 - L527 were not covered by tests
else:
self.items_to_download[path] = AssetDetails(
urls=urls, always_fetch_online=False
asset_urls=urls,
used_by={CONTEXT.processing_step},
always_fetch_online=False,
)
creator.add_item_for(
f"content/page_content_{page.id}.json",
Expand Down

0 comments on commit dfe4246

Please sign in to comment.