Skip to content

Commit

Permalink
Merge pull request #40 from openzim/stats_progress
Browse files Browse the repository at this point in the history
Add stats progress in log and in JSON file for Zimfarm
  • Loading branch information
benoit74 authored Oct 25, 2024
2 parents c2a6444 + a55f69b commit e03c789
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 2 deletions.
7 changes: 7 additions & 0 deletions scraper/src/mindtouch2zim/entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,12 @@ def main(tmpdir: str) -> None:
default=False,
)

parser.add_argument(
"--stats-filename",
help="Path to store the progress JSON file to.",
dest="stats_filename",
)

args = parser.parse_args()

logger.setLevel(level=logging.DEBUG if args.debug else logging.INFO)
Expand Down Expand Up @@ -238,6 +244,7 @@ def main(tmpdir: str) -> None:
output_folder=Path(args.output_folder),
zimui_dist=Path(args.zimui_dist),
content_filter=doc_filter,
stats_file=Path(args.stats_filename) if args.stats_filename else None,
overwrite_existing_zim=args.overwrite,
).run()
except SystemExit:
Expand Down
55 changes: 53 additions & 2 deletions scraper/src/mindtouch2zim/processor.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import argparse
import datetime
import json
import re
from io import BytesIO
from pathlib import Path

from pydantic import BaseModel
from requests.exceptions import HTTPError
from schedule import every, run_pending
from zimscraperlib.download import (
stream_file, # pyright: ignore[reportUnknownVariableType]
)
Expand Down Expand Up @@ -171,6 +173,7 @@ def __init__(
content_filter: ContentFilter,
output_folder: Path,
zimui_dist: Path,
stats_file: Path | None,
*,
overwrite_existing_zim: bool,
) -> None:
Expand All @@ -182,15 +185,24 @@ def __init__(
content_filter: User supplied filter selecting which content to convert.
output_folder: Directory to write ZIMs into.
zimui_dist: Build directory where Vite placed compiled Vue.JS frontend.
stats_file: Path where JSON task progress will be saved.
overwrite_existing_zim: Do not fail if ZIM already exists, overwrite it.
"""
self.mindtouch_client = mindtouch_client
self.zim_config = zim_config
self.content_filter = content_filter
self.output_folder = output_folder
self.zimui_dist = zimui_dist
self.stats_file = stats_file
self.overwrite_existing_zim = overwrite_existing_zim

self.stats_items_done = 0
# we add 1 more item to process so that progress is not 100% at the beginning
# when we do not yet know how many items we have to process and so that we can
# increase counter at the beginning of every for loop, not minding about what
# could happen in the loop in terms of exit conditions
self.stats_items_total = 1

self.zim_illustration_path = self.libretexts_newsite_path(
"header_logo_mini.png"
)
Expand All @@ -213,6 +225,12 @@ def run(self) -> Path:
"""
logger.info("Generating ZIM")

# create first progress report and a timer to update every 10 seconds
self._report_progress()
every(10).seconds.do( # pyright: ignore[reportUnknownMemberType]
self._report_progress
)

formatted_config = self.zim_config.format(
{
"name": self.zim_config.name,
Expand Down Expand Up @@ -274,8 +292,14 @@ def run(self) -> Path:
).model_dump_json(by_alias=True),
)

logger.info(f"Adding Vue.JS UI files in {self.zimui_dist}")
count_zimui_files = len(list(self.zimui_dist.rglob("*")))
logger.info(
f"Adding {count_zimui_files} Vue.JS UI files in {self.zimui_dist}"
)
self.stats_items_total += count_zimui_files
for file in self.zimui_dist.rglob("*"):
self.stats_items_done += 1
run_pending()
if file.is_dir():
continue
path = str(Path(file).relative_to(self.zimui_dist))
Expand All @@ -301,8 +325,12 @@ def run(self) -> Path:
)

mathjax = (Path(__file__) / "../mathjax").resolve()
logger.info(f"Adding MathJax files in {mathjax}")
count_mathjax_files = len(list(mathjax.rglob("*")))
self.stats_items_total += count_mathjax_files
logger.info(f"Adding {count_mathjax_files} MathJax files in {mathjax}")
for file in mathjax.rglob("*"):
self.stats_items_done += 1
run_pending()
if not file.is_file():
continue
path = str(Path(file).relative_to(mathjax.parent))
Expand Down Expand Up @@ -363,19 +391,25 @@ def run(self) -> Path:
logger.info("Fetching pages content")
# compute the list of existing pages to properly rewrite links leading
# in-ZIM / out-of-ZIM
self.stats_items_total += len(selected_pages)
existing_html_pages = {
ArticleUrlRewriter.normalize(
HttpUrl(f"{self.mindtouch_client.library_url}/{page.path}")
)
for page in selected_pages
}
for page in selected_pages:
self.stats_items_done += 1
run_pending()
self._process_page(
creator=creator, page=page, existing_zim_paths=existing_html_pages
)

logger.info(f" Retrieving {len(self.items_to_download)} assets...")
self.stats_items_total += len(self.items_to_download)
for asset_path, asset_urls in self.items_to_download.items():
self.stats_items_done += 1
run_pending()
for asset_url in asset_urls:
try:
asset_content = BytesIO()
Expand All @@ -395,6 +429,11 @@ def run(self) -> Path:
# missing
logger.debug(f"Ignoring {asset_path.value} due to {exc}")

# same reason as self.stats_items_total = 1 at the beginning, we need to add
# a final item to complete the progress
self.stats_items_done += 1
self._report_progress()

return zim_path

def _process_css(
Expand Down Expand Up @@ -465,6 +504,18 @@ def _process_page(
),
)

def _report_progress(self):
    """Log the current progress counters and mirror them to the stats file.

    The "done / total" counters are always written to the log. The JSON
    document (keys ``done`` and ``total``) is only written when a stats file
    path has been configured on the instance.
    """
    done, total = self.stats_items_done, self.stats_items_total
    logger.info(f" Progress {done} / {total}")
    if self.stats_file:
        # overwrite the whole file each time so it only holds the latest state
        payload = json.dumps({"done": done, "total": total}, indent=2)
        self.stats_file.write_text(payload)


# remove all standard rules, they are not adapted to Vue.JS UI
html_rules.rewrite_attribute_rules.clear()
Expand Down

0 comments on commit e03c789

Please sign in to comment.