From dfe4246c66c45795f451324f368d645865d4ee73 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 26 Nov 2024 14:56:30 +0000 Subject: [PATCH] Catch all exception on bad asset processing, and log full details about asset --- scraper/src/mindtouch2zim/asset.py | 57 ++++++++++----------- scraper/src/mindtouch2zim/entrypoint.py | 5 +- scraper/src/mindtouch2zim/html_rewriting.py | 2 +- scraper/src/mindtouch2zim/processor.py | 31 +++++++---- 4 files changed, 53 insertions(+), 42 deletions(-) diff --git a/scraper/src/mindtouch2zim/asset.py b/scraper/src/mindtouch2zim/asset.py index b598db7..b73f7a5 100644 --- a/scraper/src/mindtouch2zim/asset.py +++ b/scraper/src/mindtouch2zim/asset.py @@ -43,9 +43,15 @@ class HeaderData(NamedTuple): class AssetDetails(NamedTuple): - urls: set[HttpUrl] + asset_urls: set[HttpUrl] + used_by: set[str] always_fetch_online: bool + def used_by_str(self) -> str: + if len(self.used_by) == 0: + return "" + return f' used by {", ".join(self.used_by)}' + class AssetProcessor: @@ -62,55 +68,47 @@ def process_asset( asset_details: AssetDetails, creator: Creator, ): - logger.debug(f"Processing asset for {asset_path}") - CONTEXT.processing_step = f"processing asset {asset_path}" - self._process_asset_internal( - asset_path=asset_path, asset_details=asset_details, creator=creator - ) - - def _process_asset_internal( - self, - asset_path: ZimPath, - asset_details: AssetDetails, - creator: Creator, - ): - for asset_url in asset_details.urls: + """Download and add to the ZIM a given asset (image, ...)""" + for asset_url in asset_details.asset_urls: try: + CONTEXT.processing_step = ( + f"asset from {asset_url.value}{asset_details.used_by_str()}" + ) + logger.debug(f"Processing {CONTEXT.processing_step}") asset_content = self.get_asset_content( asset_path=asset_path, asset_url=asset_url, always_fetch_online=asset_details.always_fetch_online, ) - logger.debug( - f"Adding {asset_url.value} to {asset_path.value} in the ZIM" - ) + logger.debug(f"Adding asset to {asset_path.value} in the ZIM") creator.add_item_for( path="content/" + asset_path.value, content=asset_content.getvalue(), ) break # file found and added except KnownBadAssetFailedError as exc: - logger.debug(f"Ignoring known bad asset for {asset_url.value}: {exc}") - except RequestException as exc: + logger.debug(f"Ignoring known bad asset: {exc}") + except Exception as exc: + # all other exceptions (not only RequestsException) lead to an increase + # of bad_assets_count, because we have no idea what could go wrong here + # and limit and bad assets threshold should be correct in production, + # or ignored at own user risk with self.lock: self.bad_assets_count += 1 + log_message = ( + f"Exception while processing {CONTEXT.processing_step}: {exc}" + ) if ( CONTEXT.bad_assets_threshold >= 0 and self.bad_assets_count > CONTEXT.bad_assets_threshold ): - logger.error( - f"Exception while processing asset for {asset_url.value}: " - f"{exc}" - ) + logger.error(log_message) raise Exception( # noqa: B904 f"Asset failure threshold ({CONTEXT.bad_assets_threshold}) " "reached, stopping execution" ) else: - logger.warning( - f"Exception while processing asset for {asset_url.value}: " - f"{exc}" - ) + logger.warning(log_message) def _get_header_data_for(self, url: HttpUrl) -> HeaderData: """Get details from headers for a given url @@ -244,7 +242,8 @@ def get_asset_content( ) else: logger.debug( - f"Not optimizing, unsupported mime type: {mime_type}" + f"Not optimizing, unsupported mime type: {mime_type} for " + f"{CONTEXT.processing_step}" ) return self._download_from_online(asset_url=asset_url) @@ -256,7 +255,7 @@ def get_asset_content( if CONTEXT.bad_assets_regex and CONTEXT.bad_assets_regex.findall( asset_url.value ): - raise KnownBadAssetFailedError() from exc + raise KnownBadAssetFailedError(exc) from exc raise def _setup_s3(self): diff --git a/scraper/src/mindtouch2zim/entrypoint.py b/scraper/src/mindtouch2zim/entrypoint.py index 5a3e258..d7c7261 100644 --- a/scraper/src/mindtouch2zim/entrypoint.py +++ b/scraper/src/mindtouch2zim/entrypoint.py @@ -259,5 +259,8 @@ def main(tmpdir: str) -> None: raise except Exception as exc: logger.exception(exc) - logger.error(f"Generation failed with the following error: {exc}") + logger.error( + "Generation failed with the following error while processing " + f"{CONTEXT.processing_step}: {exc}" + ) raise SystemExit(1) from exc diff --git a/scraper/src/mindtouch2zim/html_rewriting.py b/scraper/src/mindtouch2zim/html_rewriting.py index 071ab9a..2c7aaca 100644 --- a/scraper/src/mindtouch2zim/html_rewriting.py +++ b/scraper/src/mindtouch2zim/html_rewriting.py @@ -56,7 +56,7 @@ def rewrite_href_src_srcset_attributes( new_attr_value = "" logger.warning( f"Unsupported '{attr_name}' encountered in '{tag}' tag (value: " - f"'{attr_value}') while {CONTEXT.processing_step}" + f"'{attr_value}') while rewriting {CONTEXT.processing_step}" ) return (attr_name, new_attr_value) diff --git a/scraper/src/mindtouch2zim/processor.py b/scraper/src/mindtouch2zim/processor.py index 5788ec0..86f3922 100644 --- a/scraper/src/mindtouch2zim/processor.py +++ b/scraper/src/mindtouch2zim/processor.py @@ -255,7 +255,7 @@ def run(self) -> Path: def run_with_creator(self, creator: Creator): - CONTEXT.processing_step = "Storing standard files" + CONTEXT.processing_step = "standard files" logger.info(" Storing configuration...") creator.add_item_for( @@ -339,7 +339,7 @@ def run_with_creator(self, creator: Creator): ) logger.info("Fetching pages tree") - CONTEXT.processing_step = "Fetching pages tree" + CONTEXT.processing_step = "pages tree" pages_tree = self.mindtouch_client.get_page_tree() selected_pages = self.content_filter.filter(pages_tree) logger.info( @@ -359,7 +359,7 @@ def run_with_creator(self, creator: Creator): ) logger.info("Fetching pages content") - CONTEXT.processing_step = "Fetching pages content" + CONTEXT.processing_step = "pages content" # compute the list of existing pages to properly rewrite links leading # in-ZIM / out-of-ZIM self.stats_items_total += len(selected_pages) @@ -398,7 +398,7 @@ def run_with_creator(self, creator: Creator): del private_pages logger.info(f" Retrieving {len(self.items_to_download)} assets...") - CONTEXT.processing_step = "Processing assets" + CONTEXT.processing_step = "assets" self.stats_items_total += len(self.items_to_download) res = self.asset_executor( @@ -427,6 +427,8 @@ def _process_css( """Process a given CSS stylesheet Download content if necessary, rewrite CSS and add CSS to ZIM """ + CONTEXT.processing_step = f"CSS at {css_location}" + logger.debug(f" Fetching {CONTEXT.processing_step}") if not css_location: raise ValueError(f"Cannot process empty css_location for {target_filename}") if not css_content: @@ -447,10 +449,13 @@ def _process_css( # to use last URL encountered. for path, urls in url_rewriter.items_to_download.items(): if path in self.items_to_download: - self.items_to_download[path].urls.update(urls) + self.items_to_download[path].asset_urls.update(urls) + self.items_to_download[path].used_by.add(CONTEXT.processing_step) else: self.items_to_download[path] = AssetDetails( - urls=urls, always_fetch_online=True + asset_urls=urls, + used_by={CONTEXT.processing_step}, + always_fetch_online=True, ) creator.add_item_for(f"content/{target_filename}", content=result) @@ -466,8 +471,8 @@ def _process_page( """Process a given library page Download content, rewrite HTML and add JSON to ZIM """ - logger.debug(f" Fetching {page.id}") - CONTEXT.processing_step = f"processing page {page.id} at {page.encoded_url}" + CONTEXT.processing_step = f"page ID {page.id} ({page.encoded_url})" + logger.debug(f" Fetching {CONTEXT.processing_step}") page_content = self.mindtouch_client.get_page_content(page) url_rewriter = HtmlUrlsRewriter( CONTEXT.library_url, @@ -508,7 +513,7 @@ def _process_page( # and since these pages are absolutely not essential, we just display a # warning and store an empty page logger.warning( - f"Problem processing special page ID {page.id} ({page.encoded_url})" + f"Problem processing special {CONTEXT.processing_step}" f", page is probably empty, storing empty page: {exc}" ) return "" @@ -516,11 +521,15 @@ def _process_page( # Default rewriting for 'normal' pages rewriten = rewriter.rewrite(page_content.html_body).content for path, urls in url_rewriter.items_to_download.items(): + if path in self.items_to_download: - self.items_to_download[path].urls.update(urls) + self.items_to_download[path].asset_urls.update(urls) + self.items_to_download[path].used_by.add(CONTEXT.processing_step) else: self.items_to_download[path] = AssetDetails( - urls=urls, always_fetch_online=False + asset_urls=urls, + used_by={CONTEXT.processing_step}, + always_fetch_online=False, ) creator.add_item_for( f"content/page_content_{page.id}.json",