Skip to content

Commit

Permalink
Add stats progress in log and in JSON file for Zimfarm
Browse files Browse the repository at this point in the history
  • Loading branch information
benoit74 committed Oct 25, 2024
1 parent 2cee587 commit a407ae6
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 2 deletions.
7 changes: 7 additions & 0 deletions scraper/src/mindtouch2zim/entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,12 @@ def main(tmpdir: str) -> None:
default=False,
)

parser.add_argument(

Check warning on line 209 in scraper/src/mindtouch2zim/entrypoint.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/entrypoint.py#L209

Added line #L209 was not covered by tests
"--stats-filename",
help="Path to store the progress JSON file to.",
dest="stats_filename",
)

args = parser.parse_args()

logger.setLevel(level=logging.DEBUG if args.debug else logging.INFO)
Expand Down Expand Up @@ -240,6 +246,7 @@ def main(tmpdir: str) -> None:
output_folder=Path(args.output_folder),
zimui_dist=Path(args.zimui_dist),
content_filter=doc_filter,
stats_file=Path(args.stats_filename) if args.stats_filename else None,
overwrite_existing_zim=args.overwrite,
).run()
except SystemExit:
Expand Down
49 changes: 47 additions & 2 deletions scraper/src/mindtouch2zim/processor.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import argparse
import datetime
import json
import re
from io import BytesIO
from pathlib import Path

from pydantic import BaseModel
from requests.exceptions import HTTPError
from schedule import every, run_pending
from zimscraperlib.download import (
stream_file, # pyright: ignore[reportUnknownVariableType]
)
Expand Down Expand Up @@ -171,6 +173,7 @@ def __init__(
content_filter: ContentFilter,
output_folder: Path,
zimui_dist: Path,
stats_file: Path | None,
*,
overwrite_existing_zim: bool,
) -> None:
Expand All @@ -182,15 +185,20 @@ def __init__(
content_filter: User supplied filter selecting which content to convert.
output_folder: Directory to write ZIMs into.
zimui_dist: Build directory where Vite placed compiled Vue.JS frontend.
stats_file: Path where JSON task progress will be saved.
overwrite_existing_zim: Do not fail if ZIM already exists, overwrite it.
"""
self.mindtouch_client = mindtouch_client
self.zim_config = zim_config
self.content_filter = content_filter
self.output_folder = output_folder
self.zimui_dist = zimui_dist
self.stats_file = stats_file

Check warning on line 196 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L196

Added line #L196 was not covered by tests
self.overwrite_existing_zim = overwrite_existing_zim

self.stats_items_done = 0
self.stats_items_total = 1

Check warning on line 200 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L199-L200

Added lines #L199 - L200 were not covered by tests

self.zim_illustration_path = self.libretexts_newsite_path(
"header_logo_mini.png"
)
Expand All @@ -213,6 +221,12 @@ def run(self) -> Path:
"""
logger.info("Generating ZIM")

# create first progress report and a timer to update every 10 seconds
self._report_progress()
every(10).seconds.do( # pyright: ignore[reportUnknownMemberType]

Check warning on line 226 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L225-L226

Added lines #L225 - L226 were not covered by tests
self._report_progress
)

formatted_config = self.zim_config.format(
{
"name": self.zim_config.name,
Expand Down Expand Up @@ -274,8 +288,14 @@ def run(self) -> Path:
).model_dump_json(by_alias=True),
)

logger.info(f"Adding Vue.JS UI files in {self.zimui_dist}")
count_zimui_files = len(list(self.zimui_dist.rglob("*")))
logger.info(

Check warning on line 292 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L291-L292

Added lines #L291 - L292 were not covered by tests
f"Adding {count_zimui_files} Vue.JS UI files in {self.zimui_dist}"
)
self.stats_items_total += count_zimui_files

Check warning on line 295 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L295

Added line #L295 was not covered by tests
for file in self.zimui_dist.rglob("*"):
self.stats_items_done += 1
run_pending()

Check warning on line 298 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L297-L298

Added lines #L297 - L298 were not covered by tests
if file.is_dir():
continue
path = str(Path(file).relative_to(self.zimui_dist))
Expand All @@ -301,8 +321,12 @@ def run(self) -> Path:
)

mathjax = (Path(__file__) / "../mathjax").resolve()
logger.info(f"Adding MathJax files in {mathjax}")
count_mathjax_files = len(list(mathjax.rglob("*")))
self.stats_items_total += count_mathjax_files
logger.info(f"Adding {count_mathjax_files} MathJax files in {mathjax}")

Check warning on line 326 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L324-L326

Added lines #L324 - L326 were not covered by tests
for file in mathjax.rglob("*"):
self.stats_items_done += 1
run_pending()

Check warning on line 329 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L328-L329

Added lines #L328 - L329 were not covered by tests
if not file.is_file():
continue
path = str(Path(file).relative_to(mathjax.parent))
Expand Down Expand Up @@ -363,19 +387,25 @@ def run(self) -> Path:
logger.info("Fetching pages content")
# compute the list of existing pages to properly rewrite links leading
# in-ZIM / out-of-ZIM
self.stats_items_total += len(selected_pages)

Check warning on line 390 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L390

Added line #L390 was not covered by tests
existing_html_pages = {
ArticleUrlRewriter.normalize(
HttpUrl(f"{self.mindtouch_client.library_url}/{page.path}")
)
for page in selected_pages
}
for page in selected_pages:
self.stats_items_done += 1
run_pending()

Check warning on line 399 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L398-L399

Added lines #L398 - L399 were not covered by tests
self._process_page(
creator=creator, page=page, existing_zim_paths=existing_html_pages
)

logger.info(f" Retrieving {len(self.items_to_download)} assets...")
self.stats_items_total += len(self.items_to_download)

Check warning on line 405 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L405

Added line #L405 was not covered by tests
for asset_path, asset_urls in self.items_to_download.items():
self.stats_items_done += 1
run_pending()

Check warning on line 408 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L407-L408

Added lines #L407 - L408 were not covered by tests
for asset_url in asset_urls:
try:
asset_content = BytesIO()
Expand All @@ -395,6 +425,9 @@ def run(self) -> Path:
# missing
logger.debug(f"Ignoring {asset_path.value} due to {exc}")

self.stats_items_done += 1
self._report_progress()

Check warning on line 429 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L428-L429

Added lines #L428 - L429 were not covered by tests

return zim_path

def _process_css(
Expand Down Expand Up @@ -465,6 +498,18 @@ def _process_page(
),
)

def _report_progress(self):
    """Log the current progress and mirror it to the stats file, if any.

    The `stats_items_done` and `stats_items_total` counters are always
    logged. When `stats_file` is set, they are also written to that path as
    a JSON object with `done` and `total` keys, replacing previous content.
    """
    logger.info(f" Progress {self.stats_items_done} / {self.stats_items_total}")

    # Only persist progress when a stats file has been configured.
    if self.stats_file:
        payload = json.dumps(
            {
                "done": self.stats_items_done,
                "total": self.stats_items_total,
            },
            indent=2,
        )
        self.stats_file.write_text(payload)

Check warning on line 511 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L511

Added line #L511 was not covered by tests


# remove all standard rules, they are not adapted to Vue.JS UI
html_rules.rewrite_attribute_rules.clear()
Expand Down

0 comments on commit a407ae6

Please sign in to comment.