From a407ae61919b7cb9dbc7d883299b3b852333feaf Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 25 Oct 2024 08:42:54 +0000 Subject: [PATCH] Add stats progress in log and in JSON file for Zimfarm --- scraper/src/mindtouch2zim/entrypoint.py | 7 ++++ scraper/src/mindtouch2zim/processor.py | 49 ++++++++++++++++++++++++- 2 files changed, 54 insertions(+), 2 deletions(-) diff --git a/scraper/src/mindtouch2zim/entrypoint.py b/scraper/src/mindtouch2zim/entrypoint.py index e5535ff..146fd4a 100644 --- a/scraper/src/mindtouch2zim/entrypoint.py +++ b/scraper/src/mindtouch2zim/entrypoint.py @@ -206,6 +206,12 @@ def main(tmpdir: str) -> None: default=False, ) + parser.add_argument( + "--stats-filename", + help="Path to store the progress JSON file to.", + dest="stats_filename", + ) + args = parser.parse_args() logger.setLevel(level=logging.DEBUG if args.debug else logging.INFO) @@ -240,6 +246,7 @@ def main(tmpdir: str) -> None: output_folder=Path(args.output_folder), zimui_dist=Path(args.zimui_dist), content_filter=doc_filter, + stats_file=Path(args.stats_filename) if args.stats_filename else None, overwrite_existing_zim=args.overwrite, ).run() except SystemExit: diff --git a/scraper/src/mindtouch2zim/processor.py b/scraper/src/mindtouch2zim/processor.py index 0704a50..2a1592f 100644 --- a/scraper/src/mindtouch2zim/processor.py +++ b/scraper/src/mindtouch2zim/processor.py @@ -1,11 +1,13 @@ import argparse import datetime +import json import re from io import BytesIO from pathlib import Path from pydantic import BaseModel from requests.exceptions import HTTPError +from schedule import every, run_pending from zimscraperlib.download import ( stream_file, # pyright: ignore[reportUnknownVariableType] ) @@ -171,6 +173,7 @@ def __init__( content_filter: ContentFilter, output_folder: Path, zimui_dist: Path, + stats_file: Path | None, *, overwrite_existing_zim: bool, ) -> None: @@ -182,6 +185,7 @@ def __init__( content_filter: User supplied filter selecting with content to convert. 
output_folder: Directory to write ZIMs into. zimui_dist: Build directory where Vite placed compiled Vue.JS frontend. + stats_file: Path where JSON task progress will be saved. overwrite_existing_zim: Do not fail if ZIM already exists, overwrite it. """ self.mindtouch_client = mindtouch_client @@ -189,8 +193,12 @@ def __init__( self.content_filter = content_filter self.output_folder = output_folder self.zimui_dist = zimui_dist + self.stats_file = stats_file self.overwrite_existing_zim = overwrite_existing_zim + self.stats_items_done = 0 + self.stats_items_total = 1 + self.zim_illustration_path = self.libretexts_newsite_path( "header_logo_mini.png" ) @@ -213,6 +221,12 @@ def run(self) -> Path: """ logger.info("Generating ZIM") + # create first progress report and a timer to update every 10 seconds + self._report_progress() + every(10).seconds.do( # pyright: ignore[reportUnknownMemberType] + self._report_progress + ) + formatted_config = self.zim_config.format( { "name": self.zim_config.name, @@ -274,8 +288,14 @@ def run(self) -> Path: ).model_dump_json(by_alias=True), ) - logger.info(f"Adding Vue.JS UI files in {self.zimui_dist}") + count_zimui_files = len(list(self.zimui_dist.rglob("*"))) + logger.info( + f"Adding {count_zimui_files} Vue.JS UI files in {self.zimui_dist}" + ) + self.stats_items_total += count_zimui_files for file in self.zimui_dist.rglob("*"): + self.stats_items_done += 1 + run_pending() if file.is_dir(): continue path = str(Path(file).relative_to(self.zimui_dist)) @@ -301,8 +321,12 @@ def run(self) -> Path: ) mathjax = (Path(__file__) / "../mathjax").resolve() - logger.info(f"Adding MathJax files in {mathjax}") + count_mathjax_files = len(list(mathjax.rglob("*"))) + self.stats_items_total += count_mathjax_files + logger.info(f"Adding {count_mathjax_files} MathJax files in {mathjax}") for file in mathjax.rglob("*"): + self.stats_items_done += 1 + run_pending() if not file.is_file(): continue path = str(Path(file).relative_to(mathjax.parent)) @@
-363,6 +387,7 @@ def run(self) -> Path: logger.info("Fetching pages content") # compute the list of existing pages to properly rewrite links leading # in-ZIM / out-of-ZIM + self.stats_items_total += len(selected_pages) existing_html_pages = { ArticleUrlRewriter.normalize( HttpUrl(f"{self.mindtouch_client.library_url}/{page.path}") @@ -370,12 +395,17 @@ def run(self) -> Path: for page in selected_pages } for page in selected_pages: + self.stats_items_done += 1 + run_pending() self._process_page( creator=creator, page=page, existing_zim_paths=existing_html_pages ) logger.info(f" Retrieving {len(self.items_to_download)} assets...") + self.stats_items_total += len(self.items_to_download) for asset_path, asset_urls in self.items_to_download.items(): + self.stats_items_done += 1 + run_pending() for asset_url in asset_urls: try: asset_content = BytesIO() @@ -395,6 +425,9 @@ def run(self) -> Path: # missing logger.debug(f"Ignoring {asset_path.value} due to {exc}") + self.stats_items_done += 1 + self._report_progress() + return zim_path def _process_css( @@ -465,6 +498,18 @@ def _process_page( ), ) + def _report_progress(self): + """report progress to stats file""" + + logger.info(f" Progress {self.stats_items_done} / {self.stats_items_total}") + if not self.stats_file: + return + progress = { + "done": self.stats_items_done, + "total": self.stats_items_total, + } + self.stats_file.write_text(json.dumps(progress, indent=2)) + # remove all standard rules, they are not adapted to Vue.JS UI html_rules.rewrite_attribute_rules.clear()