From 2f3d37e295e11ef037aa16d20f71c53cfda63f9b Mon Sep 17 00:00:00 2001 From: Ric Evans <19216225+ric-evans@users.noreply.github.com> Date: Wed, 20 Nov 2024 18:08:29 -0600 Subject: [PATCH] Output File Updates; Add Wrapper Script [minor] (#3) Co-authored-by: github-actions --- .github/workflows/wipac-cicd.yaml | 94 +++++++++- pyproject.toml | 4 +- ...c_histos_tree.sh => cp-src-histos-tree.sh} | 5 +- resources/sample-each-dataset.sh | 166 ++++++++++++++++++ simprod_histogram/display_histos.py | 2 +- ...le_dataset_histos.py => sample_dataset.py} | 119 ++++++++----- tests/unit/test_sample_dataset_histos.py | 150 +++++++++++++--- 7 files changed, 465 insertions(+), 75 deletions(-) rename resources/{cp_src_histos_tree.sh => cp-src-histos-tree.sh} (97%) create mode 100755 resources/sample-each-dataset.sh rename simprod_histogram/{sample_dataset_histos.py => sample_dataset.py} (72%) diff --git a/.github/workflows/wipac-cicd.yaml b/.github/workflows/wipac-cicd.yaml index 85c1864..250e103 100644 --- a/.github/workflows/wipac-cicd.yaml +++ b/.github/workflows/wipac-cicd.yaml @@ -21,7 +21,8 @@ jobs: token: ${{ secrets.PERSONAL_ACCESS_TOKEN }} - uses: WIPACrepo/wipac-dev-py-setup-action@v4.2 with: - python_min: 3.9 + python_min: 3.11 + python_max: 3.11 pypi_name: icecube-simprod-histogram author: IceCube author_email: developers@icecube.wisc.edu @@ -41,8 +42,6 @@ jobs: - uses: actions/checkout@v3 - id: versions uses: WIPACrepo/wipac-dev-py-versions-action@v2.5 - with: - range: ">=3.12" flake8: needs: [ py-versions ] @@ -83,10 +82,12 @@ jobs: - uses: actions/setup-python@v4 - name: Run Ruff for code formatting run: | + set -euo pipefail pip install ruff ruff check --select C408 --fix . --unsafe-fixes - name: Commit formatted code run: | + set -euo pipefail git config user.name github-actions git config user.email github-actions@github.com git add . 
@@ -94,7 +95,7 @@ jobs:
           git push || true
 
   ###########################################################################
-  # UNIT TESTS
+  # TESTS
   ###########################################################################
 
   unit-tests:
@@ -109,19 +110,100 @@
       - uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.py3 }}
-      - run: |
+      - name: install
+        run: |
+          set -euo pipefail
           pip install .[tests]
       - name: Run unit tests
         run: |
+          set -euo pipefail
           pytest -vvv tests/unit/
 
+  test-wrapper-script:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        max_num_datasets:
+          - 1
+          - 25
+          - 100  # i.e. all of them (currently there are 48)
+        base_path:
+          - /tmp/data/sim/Upgrade/2022/generated/neutrino-generator/88888
+          - /tmp/data/sim/IceCube/2023/filtered/CORSIKA
+          - /tmp/data/sim/Upgrade/2022/filtered
+          - /tmp/data/sim/IceCube/2023
+          - /tmp/data/sim/Upgrade
+          - /tmp/data/sim
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+      - name: Set up Python environment
+        uses: actions/setup-python@v4
+      - name: Create a mock dataset structure
+        run: |
+          set -euo pipefail
+          job_range_dpaths=(
+            /tmp/data/sim/{IceCube,Upgrade}/{2022,2023}/{generated,filtered}/{CORSIKA,neutrino-generator}/{77777,88888,99999}/{00-11,22-33,44-55}
+          )
+
+          # Create directories and populate each with files
+          for dpath in "${job_range_dpaths[@]}"; do
+            mkdir -p "$dpath"/histos/
+            # create 1-5 pkl files
+            for i in $( seq 1 "$(( (RANDOM % 5) + 1 ))" ); do
+              random_file=$(find "tests/data/simprod-histograms" -type f -name "*.pkl" | shuf -n 1)
+              cp "$random_file" "$dpath/histos/histo_$i.pkl"
+            done
+          done
+
+      - name: Look at filetree (before)
+        run: |
+          set -euo pipefail
+          tree /tmp/data/sim/
+
+      - name: Run script with matrix parameters
+        run: |
+          set -euo pipefail
+          set -x
+          ./resources/sample-each-dataset.sh ${{ matrix.base_path }} 0.5 ${{ matrix.max_num_datasets }}
+
+      - name: Validate script execution
+        run: |
+          set -euo pipefail
+          echo "Max num of datasets: ${{ matrix.max_num_datasets }}"
+
+          # Count the dataset directories under the base path (each contains job-range subdirs like "00-11")
+          available_datasets=$(find ${{ matrix.base_path }} -type d -regex ".*/[0-9]+-[0-9]+$" -exec dirname {} \; | sort -u | wc -l)
+          echo "Available datasets: $available_datasets"
+
+          # Use the lesser of available_datasets and max_num_datasets for validation
+          expected_num_datasets=$(( available_datasets < ${{ matrix.max_num_datasets }} ? available_datasets : ${{ matrix.max_num_datasets }} ))
+          echo "Expected datasets: $expected_num_datasets"
+
+          # Check processed count
+          processed_count=$(find ${{ matrix.base_path }} -name '*.histo.hdf5' | wc -l)
+          echo "Processed count: $processed_count"
+
+          if [[ $processed_count -ne $expected_num_datasets ]]; then
+            echo "Script did not process the expected number of datasets!"
+            exit 1
+          fi
+
+          echo "All tests passed."
+
+      - name: Look at filetree (after)
+        run: |
+          set -euo pipefail
+          tree /tmp/data/sim/
+
   ###########################################################################
   # RELEASE
   ###########################################################################
 
   release:
     if: github.ref == 'refs/heads/main'
-    needs: [ py-setup, flake8, mypy, code-format, unit-tests ]
+    needs: [ py-setup, flake8, mypy, code-format, unit-tests, test-wrapper-script ]
     runs-on: ubuntu-latest
     concurrency: release  # prevent any possible race conditions
     steps:
diff --git a/pyproject.toml b/pyproject.toml
index 40336ce..f09b685 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,8 +9,8 @@ name = "icecube-simprod-histogram"
 description = "Utilities for working with histograms created for simprod"
 readme = "README.md"
 keywords = ["histogram sampling", "simulation", "statistics"]
-classifiers = ["Development Status :: 3 - Alpha", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13"]
-requires-python = ">=3.9, <3.14"
+classifiers = ["Development Status :: 3 - Alpha", "Programming Language :: Python :: 3.11"]
+requires-python = ">=3.11, <3.12"
 
 [[project.authors]]
 name = "IceCube"
diff --git a/resources/cp_src_histos_tree.sh b/resources/cp-src-histos-tree.sh
similarity index 97%
rename from resources/cp_src_histos_tree.sh
rename to resources/cp-src-histos-tree.sh
index 2104c43..c073a09 100755
--- a/resources/cp_src_histos_tree.sh
+++ b/resources/cp-src-histos-tree.sh
@@ -1,4 +1,5 @@
 #!/bin/bash
+set -euo pipefail
 
 ########################################################################################
 # Script Name: Simprod Histogram Sampler
@@ -8,14 +9,14 @@
 #   to a destination directory in the user's home directory. It also provides
 #   an option for a dry run to preview actions without making changes.
 #
-# Usage: cp_src_histos_tree.sh <SOURCE_DIR> [--dryrun]
+# Usage: cp-src-histos-tree.sh <SOURCE_DIR> [--dryrun]
 #
 # Parameters:
 #   SOURCE_DIR : The source directory containing the "*/histos" directories to sample from.
 #   --dryrun   : Optional flag that, if provided, skips actual file and directory operations,
 #                outputting actions to be taken without modifying any files.
 #
-# Example: cp_src_histos_tree.sh /path/to/source --dryrun
+# Example: cp-src-histos-tree.sh /path/to/source --dryrun
 #
 # Notes:
 #   - Sampling percentages for directories and files are set to 10% by default.
diff --git a/resources/sample-each-dataset.sh b/resources/sample-each-dataset.sh
new file mode 100755
index 0000000..ca4fa5a
--- /dev/null
+++ b/resources/sample-each-dataset.sh
@@ -0,0 +1,166 @@
+#!/bin/bash
+set -euo pipefail
+
+#######################################################################################
+# This script automates the sampling of histograms from dataset directories. It takes
+# a base directory containing simulation datasets, a sample percentage for the histograms,
+# and the number of datasets to process. It scans each dataset directory to check for
+# existing histogram files and skips any datasets that have already been processed.
+#
+# Usage:
+#   ./sample-each-dataset.sh <base_path> <sample_percentage> <max_num_datasets>
+#
+# Arguments:
+#   <base_path>         - The root path under which all dataset directories are located.
+#                         Example paths:
+#                           /data/sim/IceCube/2023/generated/neutrino-generator/22645
+#                           /data/sim/IceCube/2023/generated/neutrino-generator/
+#                           /data/sim/IceCube/2023/generated/
+#                           /data/sim/IceCube/2023/
+#                           /data/sim/IceCube/
+#   <sample_percentage> - Percentage of a dataset's histograms to sample
+#   <max_num_datasets>  - Number of datasets to process in this run
+#
+# Requirements:
+#   - Python 3
+#
+#######################################################################################
+
+# Check args
+if [ "$#" -lt 3 ]; then
+    echo "Usage: $0 <base_path> <sample_percentage> <max_num_datasets>"
+    exit 1
+fi
+
+# set BASE_PATH -> scan all datasets under this path
+# ex: /data/sim/IceCube/2023/generated/neutrino-generator/22645
+# ex: /data/sim/IceCube/2023/generated/neutrino-generator/
+# ex: /data/sim/IceCube/2023/generated/
+BASE_PATH=$1
+
+SAMPLE_PERCENTAGE=$2
+MAX_NUM_DATASETS=$3
+
+#######################################################################################
+# setup python virtual environment, install the package
+
+PYVENV="simprod-histogram-pyvenv"
+pip install virtualenv
+python -m virtualenv $PYVENV
+. $PYVENV/bin/activate &&
+    pip install --upgrade pip &&
+    pip install --no-cache-dir icecube-simprod-histogram
+
+#######################################################################################
+# pre-calculate depth-to-datasets arg for 'find'
+
+# like /data/sim/IceCube/<year>/<generated|filtered>/<generator>/<dataset-id>
+# ex: /data/sim/IceCube/2023/generated/neutrino-generator/22645 -> depth=0
+# ex: /data/sim/IceCube/2023/generated/neutrino-generator/ -> depth=1
+# ex: /data/sim/IceCube/2023/generated/ -> depth=2
+depth_to_datasets=$(python3 -c "
+from pathlib import Path
+import sys
+
+path = Path(sys.argv[1])
+SIM = 'sim'
+N_SEGMENTS_BASE_TO_DATASET = 5
+
+try:
+    base_index = list(path.parts).index(SIM)
+except ValueError:
+    raise ValueError(f'Path {path} does not contain the base identifier {SIM}/')
+segments_after_base = path.parts[base_index + 1:]
+
+depth = N_SEGMENTS_BASE_TO_DATASET - len(segments_after_base)
+if depth < 0:
+    raise ValueError(f'Path {path} is too specific; the user can supply up to a dataset dir')
+print(depth)
+" "$BASE_PATH" 2>&1)
+
+#######################################################################################
+# Run!
+
+# Create a temporary file to track errors
+error_file=$(mktemp)
+echo "0" >"$error_file"
+# Create a temporary file to track count
+count_file=$(mktemp)
+echo "0" >"$count_file"
+# ...and rm those files on exit
+cleanup() {
+    rm -f "$error_file"
+    rm -f "$count_file"
+}
+trap cleanup EXIT
+trap cleanup ERR
+
+# other vars
+MAX_REACHED_CODE=2
+
+# Define a helper function to process each dataset
+process_dataset() {
+    local dataset_dir="$1"
+    local dest_dir="$dataset_dir"  # put it into the dataset directory
+    local num_processed=$(cat "$count_file")  # get the count from the file (wouldn't work if parallelized)
+
+    # Stop processing if the specified number of datasets has been reached
+    if [ "$num_processed" -ge "$MAX_NUM_DATASETS" ]; then
+        return $MAX_REACHED_CODE  # Signals to stop processing datasets
+    fi
+
+    # Check if this dataset has been processed previously
+    if find "$dest_dir" -maxdepth 1 -name "*.histo.hdf5" | read -r; then
+        echo "Skipping $dataset_dir, an output file with .histo.hdf5 extension already exists in $dest_dir."
+ return 0 # This is okay, proceed to the next dataset + fi + + # Process the dataset + echo "Processing dataset: $dataset_dir" + local error_output + error_output=$( + python -m simprod_histogram.sample_dataset \ + "$dataset_dir" \ + --sample-percentage "$SAMPLE_PERCENTAGE" \ + --dest-dir "$dest_dir" \ + 2>&1 + ) + local exit_status=$? + + # Handle subprocess exit status + if [ "$exit_status" -ne 0 ]; then + if echo "$error_output" | grep -q "HistogramNotFoundError"; then + echo "Warning: HistogramNotFoundError for $dataset_dir, skipping." + return 0 # This is okay, proceed to the next dataset + else + echo "Error: Failed to process $dataset_dir" >&2 + echo "$error_output" >&2 + echo "1" >"$error_file" # Set error flag in the temporary file + return 1 # Error! Stop processing datasets + fi + else + echo "Successfully processed $dataset_dir" + echo "$((num_processed + 1))" >"$count_file" + return 0 # This is okay, proceed to the next dataset + fi +} + +export -f process_dataset +export SAMPLE_PERCENTAGE MAX_NUM_DATASETS MAX_REACHED_CODE error_file count_file + +# Use find with -exec to process each dataset and handle return codes +find "$BASE_PATH" \ + -mindepth "$depth_to_datasets" \ + -maxdepth "$depth_to_datasets" \ + -type d \ + -exec bash -c 'process_dataset "$0"' {} \; + +# Check if any errors were flagged +if [ "$(cat "$error_file")" -ne 0 ]; then + echo "Exiting with error (see above)." >&2 + exit 1 +fi + +####################################################################################### + +echo "Done." diff --git a/simprod_histogram/display_histos.py b/simprod_histogram/display_histos.py index d20b1b5..9d49168 100644 --- a/simprod_histogram/display_histos.py +++ b/simprod_histogram/display_histos.py @@ -92,7 +92,7 @@ def main(): parser.add_argument( "path", type=Path, - help="the dataset directory to grab pickled histograms", + help="the path to the histogram file (pickle, json, or hdf5)", ) args = parser.parse_args() diff --git a/simprod_histogram/sample_dataset_histos.py b/simprod_histogram/sample_dataset.py similarity index 72% rename from simprod_histogram/sample_dataset_histos.py rename to simprod_histogram/sample_dataset.py index 9fde551..655f284 100644 --- a/simprod_histogram/sample_dataset_histos.py +++ b/simprod_histogram/sample_dataset.py @@ -1,13 +1,12 @@ """Aggregate the dataset's job's histograms by sampling.""" import argparse -import json import logging import math import pickle import random from pathlib import Path -from typing import Iterator +from typing import Any, Iterator import h5py # type: ignore import numpy as np @@ -31,18 +30,35 @@ ] -def get_job_histo_files(dataset_dir: Path, sample_percentage: float) -> Iterator[Path]: +class HistogramNotFoundError(Exception): + """Raised when a histogram is not found.""" + + +def _sample_percentage(val: Any) -> float: + val = float(val) + if val <= 0.0 or val > 1.0: + raise ValueError( + "--sample-percentage must be between 0.0 (exclusive) and 1.0 (inclusive)" + ) + return val + + +def get_job_histo_files(dpath: Path, sample_percentage: float) -> Iterator[Path]: """Yield a sample of histogram files, each originating from a job.""" - sample_percentage = max(0.0, min(sample_percentage, 1.0)) + sample_percentage = _sample_percentage(sample_percentage) + histos_found = False # NOTE: we're randomly sampling evenly across all "job-range" subdirectories, # this keeps memory down (iow, going dir-by-dir). However, it does # mean the files are yielded in "job-range" order. This is fine for # aggregating data. 
- for subdir in dataset_dir.glob("*/histos"): + for subdir in dpath.glob("*/histos"): histo_files = list(subdir.glob("*.pkl")) random.shuffle(histo_files) # randomly sample + if not histos_found and histo_files: # a neeeeed for speeeeed + histos_found = True + sample_size = math.ceil(len(histo_files) * sample_percentage) # int is floor logging.info( f"sampling {sample_percentage * 100:.1f}% of histograms in {subdir.name}" @@ -50,6 +66,10 @@ def get_job_histo_files(dataset_dir: Path, sample_percentage: float) -> Iterator ) yield from histo_files[:sample_size] + # did the glob produce any files? + if not histos_found: + raise HistogramNotFoundError(f"No histogram files found in {dpath}") + def update_aggregation(existing: dict, new: dict) -> dict: """Incorporate the 'new' histogram with the existing aggregated histogram. @@ -58,7 +78,7 @@ def update_aggregation(existing: dict, new: dict) -> dict: """ if new["name"] != existing["name"]: logging.warning( - f"new histogram '{new["name"]}' does not match existing histogram '{existing['name']}'" + f"new histogram '{new['name']}' does not match existing histogram '{existing['name']}'" ) def new_bin_values(): @@ -67,7 +87,7 @@ def new_bin_values(): if len(existing["bin_values"]) != len(new["bin_values"]): raise ValueError( f"'bin_values' list must have the same length: " - f"{existing["bin_values"]} + {new["bin_values"]}" + f"{existing['bin_values']} + {new['bin_values']}" ) return [a + b for a, b in zip(existing["bin_values"], new["bin_values"])] @@ -86,32 +106,11 @@ def new_bin_values(): return existing -def main() -> None: - """Do main.""" - parser = argparse.ArgumentParser() - parser.add_argument( - "path", - type=Path, - help="the dataset directory to grab pickled histograms", - ) - parser.add_argument( - "--sample-percentage", - type=float, - required=True, - help="the percentage of a dataset's histogram to be sampled (for each type)", - ) - parser.add_argument( - "--dest-dir", - type=Path, - required=True, - help="the destination directory to write a file containing the dataset's sampled histograms", - ) - args = parser.parse_args() - - _main(args) - - -def _main(args: argparse.Namespace) -> None: +def sample_histograms( + dpath: Path, + sample_percentage: float, +) -> dict[str, dict]: + """Assemble the sampled histograms from the dataset.""" sampled_histos = { t: { "name": t, @@ -122,13 +121,13 @@ def _main(args: argparse.Namespace) -> None: "nan_count": 0, "bin_values": [], "_sample_count": 0, - "_dataset_path": str(args.path.resolve()), + "_sample_percentage": sample_percentage, + "_dataset_path": str(dpath.resolve()), } for t in HISTO_TYPES } - # aggregate histograms into condensed samples (1 per type) - for job_file in get_job_histo_files(args.path, args.sample_percentage): + for i, job_file in enumerate(get_job_histo_files(dpath, sample_percentage)): with open(job_file, "rb") as f: contents = pickle.load(f) for histo_type in contents.keys(): @@ -150,13 +149,51 @@ def _main(args: argparse.Namespace) -> None: } ) + return sampled_histos + + +def main() -> None: + """Do main.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "path", + type=Path, + help="the dataset directory to grab pickled histograms", + ) + parser.add_argument( + "--sample-percentage", + type=_sample_percentage, + required=True, + help="the percentage of a dataset's histogram to be sampled (for each type)", + ) + parser.add_argument( + "--dest-dir", + type=Path, + required=True, + help="the destination directory to write a file containing the dataset's 
sampled histograms", + ) + parser.add_argument( + "--force", + action="store_true", + default=False, + help="force writing the output histogram even if it would overwrite an existing one.", + ) + args = parser.parse_args() + + _main(args) + + +def _main(args: argparse.Namespace) -> None: + outfile = args.dest_dir / f"{args.path.name}.histo.hdf5" + if not args.force and outfile.exists(): + raise FileExistsError(f"{outfile} already exists") + + # aggregate histograms into condensed samples (1 per type) + sampled_histos = sample_histograms(args.path, args.sample_percentage) + # # write out sampled (averaged) histos - # -> json - with open(args.dest_dir / f"{args.path.name}.json", "w") as f: - json.dump(sampled_histos, f) # don't indent - # -> hdf5 - with h5py.File(args.dest_dir / f"{args.path.name}.hdf5", "w") as f: + with h5py.File(outfile, "w") as f: for histo_type, histo in sampled_histos.items(): group = f.create_group(histo_type) for k, v in histo.items(): diff --git a/tests/unit/test_sample_dataset_histos.py b/tests/unit/test_sample_dataset_histos.py index e36a93b..abce9fa 100644 --- a/tests/unit/test_sample_dataset_histos.py +++ b/tests/unit/test_sample_dataset_histos.py @@ -1,8 +1,8 @@ -"""Tests for sample_dataset_histos.py""" +"""Tests for sample_dataset.py""" import argparse -import json import pickle +import re import sys import tempfile from pathlib import Path @@ -14,14 +14,16 @@ project_root = Path(__file__).resolve().parents[2] sys.path.insert(0, str(project_root)) -from simprod_histogram.sample_dataset_histos import ( # noqa: E402 +from simprod_histogram.sample_dataset import ( # noqa: E402 _main, get_job_histo_files, update_aggregation, + HistogramNotFoundError, ) def test_100__get_job_histo_files_sampling(): + """Test sampling of histogram files with varying sample percentages.""" # Create a temporary dataset directory with histogram files with tempfile.TemporaryDirectory() as tempdir: dataset_dir = Path(tempdir) @@ -40,12 +42,35 @@ def test_100__get_job_histo_files_sampling(): sampled_files = list(get_job_histo_files(dataset_dir, sample_percentage=1.0)) assert len(sampled_files) == 10 # Should sample all 10 files - # Sample 0% - sampled_files = list(get_job_histo_files(dataset_dir, sample_percentage=0.0)) - assert len(sampled_files) == 0 # Should sample none + # Sample 0% -> error + with pytest.raises( + ValueError, + match=re.escape( + "--sample-percentage must be between 0.0 (exclusive) and 1.0 (inclusive)" + ), + ): + list(get_job_histo_files(dataset_dir, sample_percentage=0.0)) + + +def test_110__get_job_histo_files_no_histograms(): + """Test that HistogramNotFoundError is raised when no histogram files are found.""" + # Create a temporary dataset directory without any histogram files + with tempfile.TemporaryDirectory() as tempdir: + dataset_dir = Path(tempdir) + subdir = dataset_dir / "job1/histos" + subdir.mkdir(parents=True) + + # No histogram files are created in this directory structure + + # Expect HistogramNotFoundError because there are no histogram files + with pytest.raises( + HistogramNotFoundError, match=f"No histogram files found in {dataset_dir}" + ): + list(get_job_histo_files(dataset_dir, sample_percentage=0.5)) def test_200__update_aggregation_matching_histogram(): + """Test updating histogram aggregation with matching histogram types.""" existing = { "name": "PrimaryEnergy", "xmin": 0.0, @@ -76,6 +101,7 @@ def test_200__update_aggregation_matching_histogram(): def test_210__update_aggregation_histogram_length_mismatch(): + """Test that ValueError is 
raised for bin length mismatch in aggregation.""" existing = { "name": "PrimaryEnergy", "xmin": 0.0, @@ -103,6 +129,7 @@ def test_210__update_aggregation_histogram_length_mismatch(): def test_300__aggregate_histograms(): + """Test aggregation of histograms and output to HDF5 format.""" # Mock some sample histograms and an output directory sample_histograms = { "PrimaryEnergy": { @@ -127,27 +154,104 @@ def test_300__aggregate_histograms(): with open(histo_file, "wb") as f: pickle.dump(sample_histograms, f) - # Prepare args - args = argparse.Namespace( - path=dataset_path, - sample_percentage=1.0, # sample everything - dest_dir=output_dir, + # Run + _main( + args=argparse.Namespace( + path=dataset_path, + sample_percentage=1.0, # sample everything + dest_dir=output_dir, + force=False, + ) ) - # Run main aggregation - _main(args=args) - # Check output JSON and HDF5 files - json_file = output_dir / "sample_dataset.json" - assert json_file.exists() - with open(json_file, "r") as f: - data = json.load(f) - print(data) - assert "PrimaryEnergy" in data - assert data["PrimaryEnergy"]["bin_values"] == [10, 20, 30] - - hdf5_file = output_dir / "sample_dataset.hdf5" + hdf5_file = output_dir / "sample_dataset.histo.hdf5" assert hdf5_file.exists() with h5py.File(hdf5_file, "r") as f: assert "PrimaryEnergy" in f assert list(f["PrimaryEnergy/bin_values"][:]) == [10, 20, 30] + + +def test_310__aggregate_histograms_with_force(): + """Test aggregation with force flag to overwrite existing HDF5 output.""" + # Mock some sample histograms and an output directory + sample_histograms = { + "PrimaryEnergy": { + "name": "PrimaryEnergy", + "xmin": 0.0, + "xmax": 10.0, + "overflow": 0, + "underflow": 0, + "nan_count": 0, + "bin_values": [10, 20, 30], + } + } + + with tempfile.TemporaryDirectory() as tempdir: + output_dir = Path(tempdir) + dataset_path = output_dir / "sample_dataset" + dataset_path.mkdir(parents=True) + + # Save mock histogram to dataset + histo_file = dataset_path / "00000-00001/histos/0.pkl" + histo_file.parent.mkdir(parents=True) + with open(histo_file, "wb") as f: + pickle.dump(sample_histograms, f) + + # Run main aggregation without --force (file should be created) + _main( + args=argparse.Namespace( + path=dataset_path, + sample_percentage=1.0, # sample everything + dest_dir=output_dir, + force=False, # Do not use the force flag + ) + ) + + # Check output HDF5 file + hdf5_file = output_dir / "sample_dataset.histo.hdf5" + assert hdf5_file.exists() + + # Modify the sample histograms for a different dataset + new_sample_histograms = { + "PrimaryEnergy": { + "name": "PrimaryEnergy", + "xmin": 1.0, + "xmax": 20.0, + "overflow": 1, + "underflow": 1, + "nan_count": 1, + "bin_values": [100, 200, 300], + } + } + + # Overwrite the existing pickled file with new data + with open(histo_file, "wb") as f: + pickle.dump(new_sample_histograms, f) + + # Try running again without --force; should raise an error + with pytest.raises(FileExistsError): + _main( + args=argparse.Namespace( + path=dataset_path, + sample_percentage=1.0, + dest_dir=output_dir, + force=False, + ) + ) + + # Run again with --force to allow overwrite + _main( + args=argparse.Namespace( + path=dataset_path, + sample_percentage=1.0, + dest_dir=output_dir, + force=True, # Enable force to overwrite + ) + ) + + # Check that file was overwritten and contains new data + assert hdf5_file.exists() + with h5py.File(hdf5_file, "r") as f: + assert "PrimaryEnergy" in f + assert list(f["PrimaryEnergy/bin_values"][:]) == [100, 200, 300]