From 2f3d37e295e11ef037aa16d20f71c53cfda63f9b Mon Sep 17 00:00:00 2001 From: Ric Evans <19216225+ric-evans@users.noreply.github.com> Date: Wed, 20 Nov 2024 18:08:29 -0600 Subject: [PATCH] Output File Updates; Add Wrapper Script [minor] (#3) Co-authored-by: github-actions --- .github/workflows/wipac-cicd.yaml | 94 +++++++++- pyproject.toml | 4 +- ...c_histos_tree.sh => cp-src-histos-tree.sh} | 5 +- resources/sample-each-dataset.sh | 166 ++++++++++++++++++ simprod_histogram/display_histos.py | 2 +- ...le_dataset_histos.py => sample_dataset.py} | 119 ++++++++----- tests/unit/test_sample_dataset_histos.py | 150 +++++++++++++--- 7 files changed, 465 insertions(+), 75 deletions(-) rename resources/{cp_src_histos_tree.sh => cp-src-histos-tree.sh} (97%) create mode 100755 resources/sample-each-dataset.sh rename simprod_histogram/{sample_dataset_histos.py => sample_dataset.py} (72%) diff --git a/.github/workflows/wipac-cicd.yaml b/.github/workflows/wipac-cicd.yaml index 85c1864..250e103 100644 --- a/.github/workflows/wipac-cicd.yaml +++ b/.github/workflows/wipac-cicd.yaml @@ -21,7 +21,8 @@ jobs: token: ${{ secrets.PERSONAL_ACCESS_TOKEN }} - uses: WIPACrepo/wipac-dev-py-setup-action@v4.2 with: - python_min: 3.9 + python_min: 3.11 + python_max: 3.11 pypi_name: icecube-simprod-histogram author: IceCube author_email: developers@icecube.wisc.edu @@ -41,8 +42,6 @@ jobs: - uses: actions/checkout@v3 - id: versions uses: WIPACrepo/wipac-dev-py-versions-action@v2.5 - with: - range: ">=3.12" flake8: needs: [ py-versions ] @@ -83,10 +82,12 @@ jobs: - uses: actions/setup-python@v4 - name: Run Ruff for code formatting run: | + set -euo pipefail pip install ruff ruff check --select C408 --fix . --unsafe-fixes - name: Commit formatted code run: | + set -euo pipefail git config user.name github-actions git config user.email github-actions@github.com git add . 
@@ -94,7 +95,7 @@ jobs:
           git push || true
 
   ###########################################################################
-  # UNIT TESTS
+  # TESTS
   ###########################################################################
 
   unit-tests:
@@ -109,19 +110,100 @@
       - uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.py3 }}
-      - run: |
+      - name: install
+        run: |
+          set -euo pipefail
           pip install .[tests]
       - name: Run unit tests
         run: |
+          set -euo pipefail
           pytest -vvv tests/unit/
 
+  test-wrapper-script:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        max_num_datasets:
+          - 1
+          - 25
+          - 100  # i.e. all of them (currently there are 48)
+        base_path:
+          - /tmp/data/sim/Upgrade/2022/generated/neutrino-generator/88888
+          - /tmp/data/sim/IceCube/2023/filtered/CORSIKA
+          - /tmp/data/sim/Upgrade/2022/filtered
+          - /tmp/data/sim/IceCube/2023
+          - /tmp/data/sim/Upgrade
+          - /tmp/data/sim
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+      - name: Set up Python environment
+        uses: actions/setup-python@v4
+      - name: Create a mock dataset structure
+        run: |
+          set -euo pipefail
+          job_range_dpaths=(
+            /tmp/data/sim/{IceCube,Upgrade}/{2022,2023}/{generated,filtered}/{CORSIKA,neutrino-generator}/{77777,88888,99999}/{00-11,22-33,44-55}
+          )
+
+          # Create directories and populate each with files
+          for dpath in "${job_range_dpaths[@]}"; do
+            mkdir -p "$dpath"/histos/
+            # create 1-5 pkl files
+            for i in $( seq 1 "$(( (RANDOM % 5) + 1 ))" ); do
+              random_file=$(find "tests/data/simprod-histograms" -type f -name "*.pkl" | shuf -n 1)
+              cp "$random_file" "$dpath/histos/histo_$i.pkl"
+            done
+          done
+
+      - name: Look at filetree (before)
+        run: |
+          set -euo pipefail
+          tree /tmp/data/sim/
+
+      - name: Run script with matrix parameters
+        run: |
+          set -euo pipefail
+          set -x
+          ./resources/sample-each-dataset.sh ${{ matrix.base_path }} 0.5 ${{ matrix.max_num_datasets }}
+
+      - name: Validate script execution
+        run: |
+          set -euo pipefail
+          echo "Max num of datasets: ${{ matrix.max_num_datasets }}"
+
+          # Count the dataset directories under the base path (each contains job-range subdirs like "00-11")
+          available_datasets=$(find ${{ matrix.base_path }} -type d -regex ".*/[0-9]+-[0-9]+$" -exec dirname {} \; | sort -u | wc -l)
+          echo "Available datasets: $available_datasets"
+
+          # Use the lesser of available_datasets and max_num_datasets for validation
+          expected_num_datasets=$(( available_datasets < ${{ matrix.max_num_datasets }} ? available_datasets : ${{ matrix.max_num_datasets }} ))
+          echo "Expected datasets: $expected_num_datasets"
+
+          # Check processed count
+          processed_count=$(find ${{ matrix.base_path }} -name '*.histo.hdf5' | wc -l)
+          echo "Processed count: $processed_count"
+
+          if [[ $processed_count -ne $expected_num_datasets ]]; then
+            echo "Script did not process the expected number of datasets!"
+            exit 1
+          fi
+
+          echo "All tests passed."
+
+      - name: Look at filetree (after)
+        run: |
+          set -euo pipefail
+          tree /tmp/data/sim/
+
   ###########################################################################
   # RELEASE
   ###########################################################################
 
   release:
     if: github.ref == 'refs/heads/main'
-    needs: [ py-setup, flake8, mypy, code-format, unit-tests ]
+    needs: [ py-setup, flake8, mypy, code-format, unit-tests, test-wrapper-script ]
     runs-on: ubuntu-latest
     concurrency: release  # prevent any possible race conditions
     steps:
diff --git a/pyproject.toml b/pyproject.toml
index 40336ce..f09b685 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,8 +9,8 @@ name = "icecube-simprod-histogram"
 description = "Utilities for working with histograms created for simprod"
 readme = "README.md"
 keywords = ["histogram sampling", "simulation", "statistics"]
-classifiers = ["Development Status :: 3 - Alpha", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13"]
-requires-python = ">=3.9, <3.14"
+classifiers = ["Development Status :: 3 - Alpha", "Programming Language :: Python :: 3.11"]
+requires-python = ">=3.11, <3.12"
 
 [[project.authors]]
 name = "IceCube"
diff --git a/resources/cp_src_histos_tree.sh b/resources/cp-src-histos-tree.sh
similarity index 97%
rename from resources/cp_src_histos_tree.sh
rename to resources/cp-src-histos-tree.sh
index 2104c43..c073a09 100755
--- a/resources/cp_src_histos_tree.sh
+++ b/resources/cp-src-histos-tree.sh
@@ -1,4 +1,5 @@
 #!/bin/bash
+set -euo pipefail
 
 ########################################################################################
 # Script Name: Simprod Histogram Sampler
@@ -8,14 +9,14 @@
 #   to a destination directory in the user's home directory. It also provides
 #   an option for a dry run to preview actions without making changes.
 #
-# Usage: cp_src_histos_tree.sh <SOURCE_DIR> [--dryrun]
+# Usage: cp-src-histos-tree.sh <SOURCE_DIR> [--dryrun]
 #
 # Parameters:
 #   SOURCE_DIR : The source directory containing the "*/histos" directories to sample from.
 #   --dryrun   : Optional flag that, if provided, skips actual file and directory operations,
 #                outputting actions to be taken without modifying any files.
 #
-# Example: cp_src_histos_tree.sh /path/to/source --dryrun
+# Example: cp-src-histos-tree.sh /path/to/source --dryrun
 #
 # Notes:
 #   - Sampling percentages for directories and files are set to 10% by default.
diff --git a/resources/sample-each-dataset.sh b/resources/sample-each-dataset.sh
new file mode 100755
index 0000000..ca4fa5a
--- /dev/null
+++ b/resources/sample-each-dataset.sh
@@ -0,0 +1,166 @@
+#!/bin/bash
+set -euo pipefail
+
+#######################################################################################
+# This script automates the sampling of histograms from dataset directories. It takes
+# a base directory containing simulation datasets, a sample percentage for the histograms,
+# and the number of datasets to process. It scans each dataset directory to check for
+# existing histogram files and skips any datasets that have already been processed.
+#
+# Usage:
+#   ./sample-each-dataset.sh <base_path> <sample_percentage> <max_num_datasets>
+#
+# Arguments:
+#   <base_path>         - The root path under which all dataset directories are located.
+#                         Example paths:
+#                           /data/sim/IceCube/2023/generated/neutrino-generator/22645
+#                           /data/sim/IceCube/2023/generated/neutrino-generator/
+#                           /data/sim/IceCube/2023/generated/
+#                           /data/sim/IceCube/2023/
+#                           /data/sim/IceCube/
+#   <sample_percentage> - Percentage of a dataset's histograms to sample
+#   <max_num_datasets>  - Number of datasets to process in this run
+#
+# Requirements:
+#   - Python 3
+#
+#######################################################################################
+
+# Check args
+if [ "$#" -lt 3 ]; then
+    echo "Usage: $0 <base_path> <sample_percentage> <max_num_datasets>"
+    exit 1
+fi
+
+# set BASE_PATH -> scan all datasets under this path
+# ex: /data/sim/IceCube/2023/generated/neutrino-generator/22645
+# ex: /data/sim/IceCube/2023/generated/neutrino-generator/
+# ex: /data/sim/IceCube/2023/generated/
+BASE_PATH=$1
+
+SAMPLE_PERCENTAGE=$2
+MAX_NUM_DATASETS=$3
+
+#######################################################################################
+# setup python virtual environment, install the package
+
+PYVENV="simprod-histogram-pyvenv"
+pip install virtualenv
+python -m virtualenv $PYVENV
+. $PYVENV/bin/activate &&
+    pip install --upgrade pip &&
+    pip install --no-cache-dir icecube-simprod-histogram
+
+#######################################################################################
+# pre-calculate depth-to-datasets arg for 'find'
+
+# like /data/sim/IceCube/<year>/<generated|filtered>/<generator>/<dataset-id>
+# ex: /data/sim/IceCube/2023/generated/neutrino-generator/22645 -> depth=0
+# ex: /data/sim/IceCube/2023/generated/neutrino-generator/ -> depth=1
+# ex: /data/sim/IceCube/2023/generated/ -> depth=2
+depth_to_datasets=$(python3 -c "
+from pathlib import Path
+import sys
+
+path = Path(sys.argv[1])
+SIM = 'sim'
+N_SEGMENTS_BASE_TO_DATASET = 5
+
+try:
+    base_index = list(path.parts).index(SIM)
+except ValueError:
+    raise ValueError(f'Path {path} does not contain the base identifier {SIM}/')
+segments_after_base = path.parts[base_index + 1:]
+
+depth = N_SEGMENTS_BASE_TO_DATASET - len(segments_after_base)
+if depth < 0:
+    raise ValueError(f'Path {path} is too specific; the user can supply up to a dataset dir')
+print(depth)
+" "$BASE_PATH" 2>&1)
+
+#######################################################################################
+# Run!
+
+# Create a temporary file to track errors
+error_file=$(mktemp)
+echo "0" >"$error_file"
+# Create a temporary file to track count
+count_file=$(mktemp)
+echo "0" >"$count_file"
+# ...and rm those files on exit
+cleanup() {
+    rm -f "$error_file"
+    rm -f "$count_file"
+}
+trap cleanup EXIT
+trap cleanup ERR
+
+# other vars
+MAX_REACHED_CODE=2
+
+# Define a helper function to process each dataset
+process_dataset() {
+    local dataset_dir="$1"
+    local dest_dir="$dataset_dir"  # put it into the dataset directory
+    local num_processed=$(cat "$count_file")  # get the count from the file (wouldn't work if parallelized)
+
+    # Stop processing if the specified number of datasets has been reached
+    if [ "$num_processed" -ge "$MAX_NUM_DATASETS" ]; then
+        return $MAX_REACHED_CODE  # Signals to stop processing datasets
+    fi
+
+    # Check if this dataset has been processed previously
+    if find "$dest_dir" -maxdepth 1 -name "*.histo.hdf5" | read -r; then
+        echo "Skipping $dataset_dir, an output file with .histo.hdf5 extension already exists in $dest_dir."
+ return 0 # This is okay, proceed to the next dataset + fi + + # Process the dataset + echo "Processing dataset: $dataset_dir" + local error_output + error_output=$( + python -m simprod_histogram.sample_dataset \ + "$dataset_dir" \ + --sample-percentage "$SAMPLE_PERCENTAGE" \ + --dest-dir "$dest_dir" \ + 2>&1 + ) + local exit_status=$? + + # Handle subprocess exit status + if [ "$exit_status" -ne 0 ]; then + if echo "$error_output" | grep -q "HistogramNotFoundError"; then + echo "Warning: HistogramNotFoundError for $dataset_dir, skipping." + return 0 # This is okay, proceed to the next dataset + else + echo "Error: Failed to process $dataset_dir" >&2 + echo "$error_output" >&2 + echo "1" >"$error_file" # Set error flag in the temporary file + return 1 # Error! Stop processing datasets + fi + else + echo "Successfully processed $dataset_dir" + echo "$((num_processed + 1))" >"$count_file" + return 0 # This is okay, proceed to the next dataset + fi +} + +export -f process_dataset +export SAMPLE_PERCENTAGE MAX_NUM_DATASETS MAX_REACHED_CODE error_file count_file + +# Use find with -exec to process each dataset and handle return codes +find "$BASE_PATH" \ + -mindepth "$depth_to_datasets" \ + -maxdepth "$depth_to_datasets" \ + -type d \ + -exec bash -c 'process_dataset "$0"' {} \; + +# Check if any errors were flagged +if [ "$(cat "$error_file")" -ne 0 ]; then + echo "Exiting with error (see above)." >&2 + exit 1 +fi + +####################################################################################### + +echo "Done." diff --git a/simprod_histogram/display_histos.py b/simprod_histogram/display_histos.py index d20b1b5..9d49168 100644 --- a/simprod_histogram/display_histos.py +++ b/simprod_histogram/display_histos.py @@ -92,7 +92,7 @@ def main(): parser.add_argument( "path", type=Path, - help="the dataset directory to grab pickled histograms", + help="the path to the histogram file (pickle, json, or hdf5)", ) args = parser.parse_args() diff --git a/simprod_histogram/sample_dataset_histos.py b/simprod_histogram/sample_dataset.py similarity index 72% rename from simprod_histogram/sample_dataset_histos.py rename to simprod_histogram/sample_dataset.py index 9fde551..655f284 100644 --- a/simprod_histogram/sample_dataset_histos.py +++ b/simprod_histogram/sample_dataset.py @@ -1,13 +1,12 @@ """Aggregate the dataset's job's histograms by sampling.""" import argparse -import json import logging import math import pickle import random from pathlib import Path -from typing import Iterator +from typing import Any, Iterator import h5py # type: ignore import numpy as np @@ -31,18 +30,35 @@ ] -def get_job_histo_files(dataset_dir: Path, sample_percentage: float) -> Iterator[Path]: +class HistogramNotFoundError(Exception): + """Raised when a histogram is not found.""" + + +def _sample_percentage(val: Any) -> float: + val = float(val) + if val <= 0.0 or val > 1.0: + raise ValueError( + "--sample-percentage must be between 0.0 (exclusive) and 1.0 (inclusive)" + ) + return val + + +def get_job_histo_files(dpath: Path, sample_percentage: float) -> Iterator[Path]: """Yield a sample of histogram files, each originating from a job.""" - sample_percentage = max(0.0, min(sample_percentage, 1.0)) + sample_percentage = _sample_percentage(sample_percentage) + histos_found = False # NOTE: we're randomly sampling evenly across all "job-range" subdirectories, # this keeps memory down (iow, going dir-by-dir). However, it does # mean the files are yielded in "job-range" order. This is fine for # aggregating data. 
- for subdir in dataset_dir.glob("*/histos"): + for subdir in dpath.glob("*/histos"): histo_files = list(subdir.glob("*.pkl")) random.shuffle(histo_files) # randomly sample + if not histos_found and histo_files: # a neeeeed for speeeeed + histos_found = True + sample_size = math.ceil(len(histo_files) * sample_percentage) # int is floor logging.info( f"sampling {sample_percentage * 100:.1f}% of histograms in {subdir.name}" @@ -50,6 +66,10 @@ def get_job_histo_files(dataset_dir: Path, sample_percentage: float) -> Iterator ) yield from histo_files[:sample_size] + # did the glob produce any files? + if not histos_found: + raise HistogramNotFoundError(f"No histogram files found in {dpath}") + def update_aggregation(existing: dict, new: dict) -> dict: """Incorporate the 'new' histogram with the existing aggregated histogram. @@ -58,7 +78,7 @@ def update_aggregation(existing: dict, new: dict) -> dict: """ if new["name"] != existing["name"]: logging.warning( - f"new histogram '{new["name"]}' does not match existing histogram '{existing['name']}'" + f"new histogram '{new['name']}' does not match existing histogram '{existing['name']}'" ) def new_bin_values(): @@ -67,7 +87,7 @@ def new_bin_values(): if len(existing["bin_values"]) != len(new["bin_values"]): raise ValueError( f"'bin_values' list must have the same length: " - f"{existing["bin_values"]} + {new["bin_values"]}" + f"{existing['bin_values']} + {new['bin_values']}" ) return [a + b for a, b in zip(existing["bin_values"], new["bin_values"])] @@ -86,32 +106,11 @@ def new_bin_values(): return existing -def main() -> None: - """Do main.""" - parser = argparse.ArgumentParser() - parser.add_argument( - "path", - type=Path, - help="the dataset directory to grab pickled histograms", - ) - parser.add_argument( - "--sample-percentage", - type=float, - required=True, - help="the percentage of a dataset's histogram to be sampled (for each type)", - ) - parser.add_argument( - "--dest-dir", - type=Path, - required=True, - help="the destination directory to write a file containing the dataset's sampled histograms", - ) - args = parser.parse_args() - - _main(args) - - -def _main(args: argparse.Namespace) -> None: +def sample_histograms( + dpath: Path, + sample_percentage: float, +) -> dict[str, dict]: + """Assemble the sampled histograms from the dataset.""" sampled_histos = { t: { "name": t, @@ -122,13 +121,13 @@ def _main(args: argparse.Namespace) -> None: "nan_count": 0, "bin_values": [], "_sample_count": 0, - "_dataset_path": str(args.path.resolve()), + "_sample_percentage": sample_percentage, + "_dataset_path": str(dpath.resolve()), } for t in HISTO_TYPES } - # aggregate histograms into condensed samples (1 per type) - for job_file in get_job_histo_files(args.path, args.sample_percentage): + for i, job_file in enumerate(get_job_histo_files(dpath, sample_percentage)): with open(job_file, "rb") as f: contents = pickle.load(f) for histo_type in contents.keys(): @@ -150,13 +149,51 @@ def _main(args: argparse.Namespace) -> None: } ) + return sampled_histos + + +def main() -> None: + """Do main.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "path", + type=Path, + help="the dataset directory to grab pickled histograms", + ) + parser.add_argument( + "--sample-percentage", + type=_sample_percentage, + required=True, + help="the percentage of a dataset's histogram to be sampled (for each type)", + ) + parser.add_argument( + "--dest-dir", + type=Path, + required=True, + help="the destination directory to write a file containing the dataset's 
sampled histograms", + ) + parser.add_argument( + "--force", + action="store_true", + default=False, + help="force writing the output histogram even if it would overwrite an existing one.", + ) + args = parser.parse_args() + + _main(args) + + +def _main(args: argparse.Namespace) -> None: + outfile = args.dest_dir / f"{args.path.name}.histo.hdf5" + if not args.force and outfile.exists(): + raise FileExistsError(f"{outfile} already exists") + + # aggregate histograms into condensed samples (1 per type) + sampled_histos = sample_histograms(args.path, args.sample_percentage) + # # write out sampled (averaged) histos - # -> json - with open(args.dest_dir / f"{args.path.name}.json", "w") as f: - json.dump(sampled_histos, f) # don't indent - # -> hdf5 - with h5py.File(args.dest_dir / f"{args.path.name}.hdf5", "w") as f: + with h5py.File(outfile, "w") as f: for histo_type, histo in sampled_histos.items(): group = f.create_group(histo_type) for k, v in histo.items(): diff --git a/tests/unit/test_sample_dataset_histos.py b/tests/unit/test_sample_dataset_histos.py index e36a93b..abce9fa 100644 --- a/tests/unit/test_sample_dataset_histos.py +++ b/tests/unit/test_sample_dataset_histos.py @@ -1,8 +1,8 @@ -"""Tests for sample_dataset_histos.py""" +"""Tests for sample_dataset.py""" import argparse -import json import pickle +import re import sys import tempfile from pathlib import Path @@ -14,14 +14,16 @@ project_root = Path(__file__).resolve().parents[2] sys.path.insert(0, str(project_root)) -from simprod_histogram.sample_dataset_histos import ( # noqa: E402 +from simprod_histogram.sample_dataset import ( # noqa: E402 _main, get_job_histo_files, update_aggregation, + HistogramNotFoundError, ) def test_100__get_job_histo_files_sampling(): + """Test sampling of histogram files with varying sample percentages.""" # Create a temporary dataset directory with histogram files with tempfile.TemporaryDirectory() as tempdir: dataset_dir = Path(tempdir) @@ -40,12 +42,35 @@ def test_100__get_job_histo_files_sampling(): sampled_files = list(get_job_histo_files(dataset_dir, sample_percentage=1.0)) assert len(sampled_files) == 10 # Should sample all 10 files - # Sample 0% - sampled_files = list(get_job_histo_files(dataset_dir, sample_percentage=0.0)) - assert len(sampled_files) == 0 # Should sample none + # Sample 0% -> error + with pytest.raises( + ValueError, + match=re.escape( + "--sample-percentage must be between 0.0 (exclusive) and 1.0 (inclusive)" + ), + ): + list(get_job_histo_files(dataset_dir, sample_percentage=0.0)) + + +def test_110__get_job_histo_files_no_histograms(): + """Test that HistogramNotFoundError is raised when no histogram files are found.""" + # Create a temporary dataset directory without any histogram files + with tempfile.TemporaryDirectory() as tempdir: + dataset_dir = Path(tempdir) + subdir = dataset_dir / "job1/histos" + subdir.mkdir(parents=True) + + # No histogram files are created in this directory structure + + # Expect HistogramNotFoundError because there are no histogram files + with pytest.raises( + HistogramNotFoundError, match=f"No histogram files found in {dataset_dir}" + ): + list(get_job_histo_files(dataset_dir, sample_percentage=0.5)) def test_200__update_aggregation_matching_histogram(): + """Test updating histogram aggregation with matching histogram types.""" existing = { "name": "PrimaryEnergy", "xmin": 0.0, @@ -76,6 +101,7 @@ def test_200__update_aggregation_matching_histogram(): def test_210__update_aggregation_histogram_length_mismatch(): + """Test that ValueError is 
raised for bin length mismatch in aggregation.""" existing = { "name": "PrimaryEnergy", "xmin": 0.0, @@ -103,6 +129,7 @@ def test_210__update_aggregation_histogram_length_mismatch(): def test_300__aggregate_histograms(): + """Test aggregation of histograms and output to HDF5 format.""" # Mock some sample histograms and an output directory sample_histograms = { "PrimaryEnergy": { @@ -127,27 +154,104 @@ def test_300__aggregate_histograms(): with open(histo_file, "wb") as f: pickle.dump(sample_histograms, f) - # Prepare args - args = argparse.Namespace( - path=dataset_path, - sample_percentage=1.0, # sample everything - dest_dir=output_dir, + # Run + _main( + args=argparse.Namespace( + path=dataset_path, + sample_percentage=1.0, # sample everything + dest_dir=output_dir, + force=False, + ) ) - # Run main aggregation - _main(args=args) - # Check output JSON and HDF5 files - json_file = output_dir / "sample_dataset.json" - assert json_file.exists() - with open(json_file, "r") as f: - data = json.load(f) - print(data) - assert "PrimaryEnergy" in data - assert data["PrimaryEnergy"]["bin_values"] == [10, 20, 30] - - hdf5_file = output_dir / "sample_dataset.hdf5" + hdf5_file = output_dir / "sample_dataset.histo.hdf5" assert hdf5_file.exists() with h5py.File(hdf5_file, "r") as f: assert "PrimaryEnergy" in f assert list(f["PrimaryEnergy/bin_values"][:]) == [10, 20, 30] + + +def test_310__aggregate_histograms_with_force(): + """Test aggregation with force flag to overwrite existing HDF5 output.""" + # Mock some sample histograms and an output directory + sample_histograms = { + "PrimaryEnergy": { + "name": "PrimaryEnergy", + "xmin": 0.0, + "xmax": 10.0, + "overflow": 0, + "underflow": 0, + "nan_count": 0, + "bin_values": [10, 20, 30], + } + } + + with tempfile.TemporaryDirectory() as tempdir: + output_dir = Path(tempdir) + dataset_path = output_dir / "sample_dataset" + dataset_path.mkdir(parents=True) + + # Save mock histogram to dataset + histo_file = dataset_path / "00000-00001/histos/0.pkl" + histo_file.parent.mkdir(parents=True) + with open(histo_file, "wb") as f: + pickle.dump(sample_histograms, f) + + # Run main aggregation without --force (file should be created) + _main( + args=argparse.Namespace( + path=dataset_path, + sample_percentage=1.0, # sample everything + dest_dir=output_dir, + force=False, # Do not use the force flag + ) + ) + + # Check output HDF5 file + hdf5_file = output_dir / "sample_dataset.histo.hdf5" + assert hdf5_file.exists() + + # Modify the sample histograms for a different dataset + new_sample_histograms = { + "PrimaryEnergy": { + "name": "PrimaryEnergy", + "xmin": 1.0, + "xmax": 20.0, + "overflow": 1, + "underflow": 1, + "nan_count": 1, + "bin_values": [100, 200, 300], + } + } + + # Overwrite the existing pickled file with new data + with open(histo_file, "wb") as f: + pickle.dump(new_sample_histograms, f) + + # Try running again without --force; should raise an error + with pytest.raises(FileExistsError): + _main( + args=argparse.Namespace( + path=dataset_path, + sample_percentage=1.0, + dest_dir=output_dir, + force=False, + ) + ) + + # Run again with --force to allow overwrite + _main( + args=argparse.Namespace( + path=dataset_path, + sample_percentage=1.0, + dest_dir=output_dir, + force=True, # Enable force to overwrite + ) + ) + + # Check that file was overwritten and contains new data + assert hdf5_file.exists() + with h5py.File(hdf5_file, "r") as f: + assert "PrimaryEnergy" in f + assert list(f["PrimaryEnergy/bin_values"][:]) == [100, 200, 300]