-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Output File Updates; Add Wrapper Script [minor] (#3)
Co-authored-by: github-actions <[email protected]>
- Loading branch information
Showing
7 changed files
with
465 additions
and
75 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,7 +21,8 @@ jobs: | |
token: ${{ secrets.PERSONAL_ACCESS_TOKEN }} | ||
- uses: WIPACrepo/[email protected] | ||
with: | ||
python_min: 3.9 | ||
python_min: 3.11 | ||
python_max: 3.11 | ||
pypi_name: icecube-simprod-histogram | ||
author: IceCube | ||
author_email: [email protected] | ||
|
@@ -41,8 +42,6 @@ jobs: | |
- uses: actions/checkout@v3 | ||
- id: versions | ||
uses: WIPACrepo/[email protected] | ||
with: | ||
range: ">=3.12" | ||
|
||
flake8: | ||
needs: [ py-versions ] | ||
|
@@ -83,18 +82,20 @@ jobs: | |
- uses: actions/setup-python@v4 | ||
- name: Run Ruff for code formatting | ||
run: | | ||
set -euo pipefail | ||
pip install ruff | ||
ruff check --select C408 --fix . --unsafe-fixes | ||
- name: Commit formatted code | ||
run: | | ||
set -euo pipefail | ||
git config user.name github-actions | ||
git config user.email [email protected] | ||
git add . | ||
git commit -m "<bot> auto code format file(s)" || true | ||
git push || true | ||
########################################################################### | ||
# UNIT TESTS | ||
# TESTS | ||
########################################################################### | ||
|
||
unit-tests: | ||
|
@@ -109,19 +110,100 @@ jobs: | |
- uses: actions/setup-python@v4 | ||
with: | ||
python-version: ${{ matrix.py3 }} | ||
- run: | | ||
- name: install | ||
run: | | ||
set -euo pipefail | ||
pip install .[tests] | ||
- name: Run unit tests | ||
run: | | ||
set -euo pipefail | ||
pytest -vvv tests/unit/ | ||
test-wrapper-script: | ||
runs-on: ubuntu-latest | ||
strategy: | ||
fail-fast: false | ||
matrix: | ||
max_num_datasets: | ||
- 1 | ||
- 25 | ||
- 100 # aka all of them, currently, there are 48 | ||
base_path: | ||
- /tmp/data/sim/Upgrade/2022/generated/neutrino-generator/88888 | ||
- /tmp/data/sim/IceCube/2023/filtered/CORSIKA | ||
- /tmp/data/sim/Upgrade/2022/filtered | ||
- /tmp/data/sim/IceCube/2023 | ||
- /tmp/data/sim/Upgrade | ||
- /tmp/data/sim | ||
steps: | ||
- name: Checkout repository | ||
uses: actions/checkout@v3 | ||
- name: Set up Python environment | ||
uses: actions/setup-python@v4 | ||
- name: Create a mock dataset structure | ||
run: | | ||
set -euo pipefail | ||
job_range_dpaths=( | ||
/tmp/data/sim/{IceCube,Upgrade}/{2022,2023}/{generated,filtered}/{CORSIKA,neutrino-generator}/{77777,88888,99999}/{00-11,22-33,44-55} | ||
) | ||
# Create directories and conditionally populate files | ||
for dpath in "${job_range_dpaths[@]}"; do | ||
mkdir -p "$dpath"/histos/ | ||
# create 1-5 pkl files | ||
for i in $( seq 1 "$(( (RANDOM % 5) + 1 ))" ); do | ||
random_file=$(find "tests/data/simprod-histograms" -type f -name "*.pkl" | shuf -n 1) | ||
cp "$random_file" "$dpath/histos/histo_$i.pkl" | ||
done | ||
done | ||
- name: Look at filetree (before) | ||
run: | | ||
set -euo pipefail | ||
tree /tmp/data/sim/ | ||
- name: Run script with matrix parameters | ||
run: | | ||
set -euo pipefail | ||
set -x | ||
./resources/sample-each-dataset.sh ${{ matrix.base_path }} 0.5 ${{ matrix.max_num_datasets }} | ||
- name: Validate script execution | ||
run: | | ||
set -euo pipefail | ||
echo "Max num of datasets: ${{ matrix.max_num_datasets }}" | ||
# Count dataset directories containing at least one "*.histo.hdf5" file | ||
available_datasets=$(find ${{ matrix.base_path }} -type d -regex ".*/[0-9]+-[0-9]+$" -exec dirname {} \; | sort -u | wc -l) | ||
echo "Available datasets: $available_datasets" | ||
# Use the lesser of available_datasets and num_datasets for validation | ||
expected_num_datasets=$(( available_datasets < ${{ matrix.max_num_datasets }} ? available_datasets : ${{ matrix.max_num_datasets }} )) | ||
echo "Expected datasets: $expected_num_datasets" | ||
# Check processed count | ||
processed_count=$(find ${{ matrix.base_path }} -name '*.histo.hdf5' | wc -l) | ||
echo "Processed count: $processed_count" | ||
if [[ $processed_count -ne $expected_num_datasets ]]; then | ||
echo "Script did not process the expected number of datasets!" | ||
exit 1 | ||
fi | ||
echo "All tests passed." | ||
- name: Look at filetree (after) | ||
run: | | ||
set -euo pipefail | ||
tree /tmp/data/sim/ | ||
########################################################################### | ||
# RELEASE | ||
########################################################################### | ||
|
||
release: | ||
if: github.ref == 'refs/heads/main' | ||
needs: [ py-setup, flake8, mypy, code-format, unit-tests ] | ||
needs: [ py-setup, flake8, mypy, code-format, unit-tests, test-wrapper-script ] | ||
runs-on: ubuntu-latest | ||
concurrency: release # prevent any possible race conditions | ||
steps: | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,166 @@ | ||
#!/bin/bash
set -euo pipefail

#######################################################################################
# Samples histograms from simulation dataset directories.
#
# Given a base directory containing simulation datasets, a sample percentage for the
# histograms, and a cap on how many datasets to process, this script scans each
# dataset directory, checking for existing histogram files and skipping any datasets
# that have already been processed.
#
# Usage:
#   ./sample-each-dataset.sh <BASE_PATH> <SAMPLE_PERCENTAGE> <MAX_NUM_DATASETS>
#
# Arguments:
#   <BASE_PATH>         - The root path under which all dataset directories are located.
#                         Example paths:
#                           /data/sim/IceCube/2023/generated/neutrino-generator/22645
#                           /data/sim/IceCube/2023/generated/neutrino-generator/
#                           /data/sim/IceCube/2023/generated/
#                           /data/sim/IceCube/2023/
#                           /data/sim/IceCube/
#   <SAMPLE_PERCENTAGE> - Percentage of a dataset's histograms to sample
#   <MAX_NUM_DATASETS>  - Number of datasets to process in this run
#
# Requirements:
#   - Python 3
#######################################################################################

# Bail out early unless all three positional arguments were supplied.
if (( $# < 3 )); then
    printf '%s\n' "Usage: $0 <BASE_PATH> <SAMPLE_PERCENTAGE> <MAX_NUM_DATASETS>"
    exit 1
fi

# Root to scan for datasets — may itself be a dataset dir, or any ancestor of one
# (e.g. /data/sim/IceCube/2023/generated/neutrino-generator/22645, or just
#  /data/sim/IceCube/2023/generated/).
BASE_PATH=$1
SAMPLE_PERCENTAGE=$2    # percentage of each dataset's histograms to sample
MAX_NUM_DATASETS=$3     # cap on how many datasets this run will process
|
||
#######################################################################################
# Set up a python virtual environment and install the package.

readonly PYVENV="simprod-histogram-pyvenv"
# Use the stdlib 'venv' module (the script already requires Python 3) instead of
# pip-installing the third-party 'virtualenv' package into the ambient environment.
python3 -m venv "$PYVENV"
# shellcheck disable=SC1091 -- the activate script only exists at runtime
. "$PYVENV/bin/activate"
pip install --upgrade pip
pip install --no-cache-dir icecube-simprod-histogram
|
||
#######################################################################################
# Pre-calculate the depth-to-datasets arg for 'find'.
#
# Dataset dirs look like /data/sim/IceCube/<year>/<generated>/<neutrino-generator>/<dataset_id>
#   ex: /data/sim/IceCube/2023/generated/neutrino-generator/22645 -> depth=0
#   ex: /data/sim/IceCube/2023/generated/neutrino-generator/      -> depth=1
#   ex: /data/sim/IceCube/2023/generated/                         -> depth=2
#
# FIX: stderr is no longer merged into the captured value (was '2>&1'), which
# silently swallowed python tracebacks — 'set -e' aborted with no message at all.
depth_to_datasets=$(python3 -c "
from pathlib import Path
import sys

path = Path(sys.argv[1])
SIM = 'sim'
N_SEGMENTS_BASE_TO_DATASET = 5

try:
    base_index = list(path.parts).index(SIM)
except ValueError:
    raise ValueError(f'Path {path} does not contain the base identifier {SIM}/')

segments_after_base = path.parts[base_index + 1:]
depth = N_SEGMENTS_BASE_TO_DATASET - len(segments_after_base)
if depth < 0:
    raise ValueError(f'Path {path} is too specific; the user can supply up to a dataset dir')

print(depth)
" "$BASE_PATH")

# Belt-and-suspenders: make sure we really got a non-negative integer back.
if ! [[ "$depth_to_datasets" =~ ^[0-9]+$ ]]; then
    echo "Error: could not compute dataset depth for '$BASE_PATH'" >&2
    exit 1
fi
|
||
#######################################################################################
# Run!

# Temp files track state across the per-dataset child shells
# (a child process cannot modify this shell's variables).

# Create a temporary file to track errors — holds "1" once any dataset fails.
error_file=$(mktemp)
echo "0" >"$error_file"

# Create a temporary file to track count of processed datasets.
count_file=$(mktemp)
# FIX: was '>"count_file"' (a literal filename) — the real "$count_file" stayed
# empty, so the numeric '-ge' comparison in process_dataset operated on "".
echo "0" >"$count_file"

# Remove the tracking files on any exit path.
cleanup() {
    rm -f -- "$error_file" "$count_file"
}
trap cleanup EXIT
trap cleanup ERR

# other vars
MAX_REACHED_CODE=2   # return code meaning "dataset cap reached — stop walking"
|
||
#######################################
# Process a single dataset directory: sample its histograms into the dataset
# dir itself, skipping datasets that are done or already have output.
# Globals:   SAMPLE_PERCENTAGE, MAX_NUM_DATASETS, MAX_REACHED_CODE (read)
#            count_file, error_file (read/written — paths to tracking files)
# Arguments: $1 - dataset directory
# Returns:   0 to continue; 1 on processing error (also flags error_file);
#            MAX_REACHED_CODE when the dataset cap has been reached.
#######################################
process_dataset() {
    local dataset_dir="$1"
    local dest_dir="$dataset_dir"  # put output into the dataset directory

    # Split declaration from assignment so a 'cat' failure isn't masked by
    # 'local' always returning 0 (SC2155).
    local num_processed
    num_processed=$(cat "$count_file")  # NOTE: wouldn't work if parallelized

    # Stop processing if the specified number of datasets has been reached.
    if [ "$num_processed" -ge "$MAX_NUM_DATASETS" ]; then
        return "$MAX_REACHED_CODE"  # Signals to stop processing datasets
    fi

    # Check if this dataset has been processed previously.
    if find "$dest_dir" -maxdepth 1 -name "*.histo.hdf5" | read -r; then
        echo "Skipping $dataset_dir, an output file with .histo.hdf5 extension already exists in $dest_dir."
        return 0  # This is okay, proceed to the next dataset
    fi

    # Process the dataset.
    echo "Processing dataset: $dataset_dir"
    local error_output
    error_output=$(
        python -m simprod_histogram.sample_dataset \
            "$dataset_dir" \
            --sample-percentage "$SAMPLE_PERCENTAGE" \
            --dest-dir "$dest_dir" \
            2>&1
    )
    local exit_status=$?  # must capture on this line — 'local' alone clobbers $?

    # Handle subprocess exit status.
    if [ "$exit_status" -ne 0 ]; then
        if echo "$error_output" | grep -q "HistogramNotFoundError"; then
            # Dataset simply has no histograms — not a failure.
            echo "Warning: HistogramNotFoundError for $dataset_dir, skipping."
            return 0  # This is okay, proceed to the next dataset
        else
            echo "Error: Failed to process $dataset_dir" >&2
            echo "$error_output" >&2
            echo "1" >"$error_file"  # Set error flag in the temporary file
            return 1  # Error! Stop processing datasets
        fi
    else
        echo "Successfully processed $dataset_dir"
        echo "$((num_processed + 1))" >"$count_file"
        return 0  # This is okay, proceed to the next dataset
    fi
}

# Make the function and its globals visible to the child bash spawned per dataset.
export -f process_dataset
export SAMPLE_PERCENTAGE MAX_NUM_DATASETS MAX_REACHED_CODE error_file count_file
|
||
# Walk every dataset dir under BASE_PATH and process each one.
#
# Each dataset is handled by a child bash so a failure can't kill this shell
# (the parent's 'set -e' is not inherited by the explicit child process).
# FIX: the original used 'find -exec', which cannot stop early — the function's
# MAX_REACHED_CODE "stop" signal was silently ignored and find kept spawning
# children. This NUL-delimited loop honors it and breaks out.
while IFS= read -r -d '' dataset_dir; do
    rv=0
    bash -c 'process_dataset "$0"' "$dataset_dir" || rv=$?
    if [ "$rv" -eq "$MAX_REACHED_CODE" ]; then
        break  # processed the requested number of datasets
    fi
done < <(
    find "$BASE_PATH" \
        -mindepth "$depth_to_datasets" \
        -maxdepth "$depth_to_datasets" \
        -type d \
        -print0
)

# Check if any errors were flagged.
if [ "$(cat "$error_file")" -ne 0 ]; then
    echo "Exiting with error (see above)." >&2
    exit 1
fi

#######################################################################################

echo "Done."
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.