From dfe4246c66c45795f451324f368d645865d4ee73 Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Tue, 26 Nov 2024 14:56:30 +0000
Subject: [PATCH] Catch all exception on bad asset processing, and log full
 details about asset

---
 scraper/src/mindtouch2zim/asset.py          | 57 ++++++++++-----------
 scraper/src/mindtouch2zim/entrypoint.py     |  5 +-
 scraper/src/mindtouch2zim/html_rewriting.py |  2 +-
 scraper/src/mindtouch2zim/processor.py      | 31 +++++++----
 4 files changed, 53 insertions(+), 42 deletions(-)

diff --git a/scraper/src/mindtouch2zim/asset.py b/scraper/src/mindtouch2zim/asset.py
index b598db7..b73f7a5 100644
--- a/scraper/src/mindtouch2zim/asset.py
+++ b/scraper/src/mindtouch2zim/asset.py
@@ -43,9 +43,15 @@ class HeaderData(NamedTuple):
 
 
 class AssetDetails(NamedTuple):
-    urls: set[HttpUrl]
+    asset_urls: set[HttpUrl]
+    used_by: set[str]
     always_fetch_online: bool
 
+    def used_by_str(self) -> str:
+        if len(self.used_by) == 0:
+            return ""
+        return f' used by {", ".join(self.used_by)}'
+
 
 class AssetProcessor:
 
@@ -62,55 +68,47 @@ def process_asset(
         asset_details: AssetDetails,
         creator: Creator,
     ):
-        logger.debug(f"Processing asset for {asset_path}")
-        CONTEXT.processing_step = f"processing asset {asset_path}"
-        self._process_asset_internal(
-            asset_path=asset_path, asset_details=asset_details, creator=creator
-        )
-
-    def _process_asset_internal(
-        self,
-        asset_path: ZimPath,
-        asset_details: AssetDetails,
-        creator: Creator,
-    ):
-        for asset_url in asset_details.urls:
+        """Download and add to the ZIM a given asset (image, ...)"""
+        for asset_url in asset_details.asset_urls:
             try:
+                CONTEXT.processing_step = (
+                    f"asset from {asset_url.value}{asset_details.used_by_str()}"
+                )
+                logger.debug(f"Processing {CONTEXT.processing_step}")
                 asset_content = self.get_asset_content(
                     asset_path=asset_path,
                     asset_url=asset_url,
                     always_fetch_online=asset_details.always_fetch_online,
                 )
-                logger.debug(
-                    f"Adding {asset_url.value} to {asset_path.value} in the ZIM"
-                )
+                logger.debug(f"Adding asset to {asset_path.value} in the ZIM")
                 creator.add_item_for(
                     path="content/" + asset_path.value,
                     content=asset_content.getvalue(),
                 )
                 break  # file found and added
             except KnownBadAssetFailedError as exc:
-                logger.debug(f"Ignoring known bad asset for {asset_url.value}: {exc}")
-            except RequestException as exc:
+                logger.debug(f"Ignoring known bad asset: {exc}")
+            except Exception as exc:
+                # all other exceptions (not only RequestsException) lead to an increase
+                # of bad_assets_count, because we have no idea what could go wrong here
+                # and limit and bad assets threshold should be correct in production,
+                # or ignored at own user risk
                 with self.lock:
                     self.bad_assets_count += 1
+                    log_message = (
+                        f"Exception while processing {CONTEXT.processing_step}: {exc}"
+                    )
                     if (
                         CONTEXT.bad_assets_threshold >= 0
                         and self.bad_assets_count > CONTEXT.bad_assets_threshold
                     ):
-                        logger.error(
-                            f"Exception while processing asset for {asset_url.value}: "
-                            f"{exc}"
-                        )
+                        logger.error(log_message)
                         raise Exception(  # noqa: B904
                             f"Asset failure threshold ({CONTEXT.bad_assets_threshold}) "
                             "reached, stopping execution"
                         )
                     else:
-                        logger.warning(
-                            f"Exception while processing asset for {asset_url.value}: "
-                            f"{exc}"
-                        )
+                        logger.warning(log_message)
 
     def _get_header_data_for(self, url: HttpUrl) -> HeaderData:
         """Get details from headers for a given url
@@ -244,7 +242,8 @@ def get_asset_content(
                         )
                     else:
                         logger.debug(
-                            f"Not optimizing, unsupported mime type: {mime_type}"
+                            f"Not optimizing, unsupported mime type: {mime_type} for "
+                            f"{CONTEXT.processing_step}"
                         )
 
             return self._download_from_online(asset_url=asset_url)
@@ -256,7 +255,7 @@ def get_asset_content(
             if CONTEXT.bad_assets_regex and CONTEXT.bad_assets_regex.findall(
                 asset_url.value
             ):
-                raise KnownBadAssetFailedError() from exc
+                raise KnownBadAssetFailedError(exc) from exc
             raise
 
     def _setup_s3(self):
diff --git a/scraper/src/mindtouch2zim/entrypoint.py b/scraper/src/mindtouch2zim/entrypoint.py
index 5a3e258..d7c7261 100644
--- a/scraper/src/mindtouch2zim/entrypoint.py
+++ b/scraper/src/mindtouch2zim/entrypoint.py
@@ -259,5 +259,8 @@ def main(tmpdir: str) -> None:
         raise
     except Exception as exc:
         logger.exception(exc)
-        logger.error(f"Generation failed with the following error: {exc}")
+        logger.error(
+            "Generation failed with the following error while processing "
+            f"{CONTEXT.processing_step}: {exc}"
+        )
         raise SystemExit(1) from exc
diff --git a/scraper/src/mindtouch2zim/html_rewriting.py b/scraper/src/mindtouch2zim/html_rewriting.py
index 071ab9a..2c7aaca 100644
--- a/scraper/src/mindtouch2zim/html_rewriting.py
+++ b/scraper/src/mindtouch2zim/html_rewriting.py
@@ -56,7 +56,7 @@ def rewrite_href_src_srcset_attributes(
         new_attr_value = ""
         logger.warning(
             f"Unsupported '{attr_name}' encountered in '{tag}' tag (value: "
-            f"'{attr_value}') while {CONTEXT.processing_step}"
+            f"'{attr_value}') while rewriting {CONTEXT.processing_step}"
         )
     return (attr_name, new_attr_value)
 
diff --git a/scraper/src/mindtouch2zim/processor.py b/scraper/src/mindtouch2zim/processor.py
index 5788ec0..86f3922 100644
--- a/scraper/src/mindtouch2zim/processor.py
+++ b/scraper/src/mindtouch2zim/processor.py
@@ -255,7 +255,7 @@ def run(self) -> Path:
 
     def run_with_creator(self, creator: Creator):
 
-        CONTEXT.processing_step = "Storing standard files"
+        CONTEXT.processing_step = "standard files"
 
         logger.info("  Storing configuration...")
         creator.add_item_for(
@@ -339,7 +339,7 @@ def run_with_creator(self, creator: Creator):
         )
 
         logger.info("Fetching pages tree")
-        CONTEXT.processing_step = "Fetching pages tree"
+        CONTEXT.processing_step = "pages tree"
         pages_tree = self.mindtouch_client.get_page_tree()
         selected_pages = self.content_filter.filter(pages_tree)
         logger.info(
@@ -359,7 +359,7 @@ def run_with_creator(self, creator: Creator):
         )
 
         logger.info("Fetching pages content")
-        CONTEXT.processing_step = "Fetching pages content"
+        CONTEXT.processing_step = "pages content"
         # compute the list of existing pages to properly rewrite links leading
         # in-ZIM / out-of-ZIM
         self.stats_items_total += len(selected_pages)
@@ -398,7 +398,7 @@ def run_with_creator(self, creator: Creator):
         del private_pages
 
         logger.info(f"  Retrieving {len(self.items_to_download)} assets...")
-        CONTEXT.processing_step = "Processing assets"
+        CONTEXT.processing_step = "assets"
         self.stats_items_total += len(self.items_to_download)
 
         res = self.asset_executor(
@@ -427,6 +427,8 @@ def _process_css(
         """Process a given CSS stylesheet
         Download content if necessary, rewrite CSS and add CSS to ZIM
         """
+        CONTEXT.processing_step = f"CSS at {css_location}"
+        logger.debug(f"  Fetching {CONTEXT.processing_step}")
         if not css_location:
             raise ValueError(f"Cannot process empty css_location for {target_filename}")
         if not css_content:
@@ -447,10 +449,13 @@ def _process_css(
         # to use last URL encountered.
         for path, urls in url_rewriter.items_to_download.items():
             if path in self.items_to_download:
-                self.items_to_download[path].urls.update(urls)
+                self.items_to_download[path].asset_urls.update(urls)
+                self.items_to_download[path].used_by.add(CONTEXT.processing_step)
             else:
                 self.items_to_download[path] = AssetDetails(
-                    urls=urls, always_fetch_online=True
+                    asset_urls=urls,
+                    used_by={CONTEXT.processing_step},
+                    always_fetch_online=True,
                 )
         creator.add_item_for(f"content/{target_filename}", content=result)
 
@@ -466,8 +471,8 @@ def _process_page(
         """Process a given library page
         Download content, rewrite HTML and add JSON to ZIM
         """
-        logger.debug(f"  Fetching {page.id}")
-        CONTEXT.processing_step = f"processing page {page.id} at {page.encoded_url}"
+        CONTEXT.processing_step = f"page ID {page.id} ({page.encoded_url})"
+        logger.debug(f"  Fetching {CONTEXT.processing_step}")
         page_content = self.mindtouch_client.get_page_content(page)
         url_rewriter = HtmlUrlsRewriter(
             CONTEXT.library_url,
@@ -508,7 +513,7 @@ def _process_page(
                 # and since these pages are absolutely not essential, we just display a
                 # warning and store an empty page
                 logger.warning(
-                    f"Problem processing special page ID {page.id} ({page.encoded_url})"
+                    f"Problem processing special {CONTEXT.processing_step}"
                     f", page is probably empty, storing empty page: {exc}"
                 )
                 return ""
@@ -516,11 +521,15 @@ def _process_page(
             # Default rewriting for 'normal' pages
             rewriten = rewriter.rewrite(page_content.html_body).content
         for path, urls in url_rewriter.items_to_download.items():
+
             if path in self.items_to_download:
-                self.items_to_download[path].urls.update(urls)
+                self.items_to_download[path].asset_urls.update(urls)
+                self.items_to_download[path].used_by.add(CONTEXT.processing_step)
             else:
                 self.items_to_download[path] = AssetDetails(
-                    urls=urls, always_fetch_online=False
+                    asset_urls=urls,
+                    used_by={CONTEXT.processing_step},
+                    always_fetch_online=False,
                 )
         creator.add_item_for(
             f"content/page_content_{page.id}.json",