Output File Updates; Add Wrapper Script [minor] (#3)
Co-authored-by: github-actions <[email protected]>
ric-evans and github-actions authored Nov 21, 2024
1 parent 3be274f commit 2f3d37e
Showing 7 changed files with 465 additions and 75 deletions.
94 changes: 88 additions & 6 deletions .github/workflows/wipac-cicd.yaml
@@ -21,7 +21,8 @@ jobs:
token: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
- uses: WIPACrepo/[email protected]
with:
python_min: 3.9
python_min: 3.11
python_max: 3.11
pypi_name: icecube-simprod-histogram
author: IceCube
author_email: [email protected]
@@ -41,8 +42,6 @@ jobs:
- uses: actions/checkout@v3
- id: versions
uses: WIPACrepo/[email protected]
with:
range: ">=3.12"

flake8:
needs: [ py-versions ]
@@ -83,18 +82,20 @@ jobs:
- uses: actions/setup-python@v4
- name: Run Ruff for code formatting
run: |
set -euo pipefail
pip install ruff
ruff check --select C408 --fix . --unsafe-fixes
- name: Commit formatted code
run: |
set -euo pipefail
git config user.name github-actions
git config user.email [email protected]
git add .
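# '|| true' below keeps the step green even when there are no formatting changes to commit or push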
git commit -m "<bot> auto code format file(s)" || true
git push || true
###########################################################################
# UNIT TESTS
# TESTS
###########################################################################

unit-tests:
@@ -109,19 +110,100 @@ jobs:
- uses: actions/setup-python@v4
with:
python-version: ${{ matrix.py3 }}
- run: |
- name: install
run: |
set -euo pipefail
pip install .[tests]
- name: Run unit tests
run: |
set -euo pipefail
pytest -vvv tests/unit/
test-wrapper-script:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
max_num_datasets:
- 1
- 25
- 100 # i.e. all of them (currently there are 48)
base_path:
- /tmp/data/sim/Upgrade/2022/generated/neutrino-generator/88888
- /tmp/data/sim/IceCube/2023/filtered/CORSIKA
- /tmp/data/sim/Upgrade/2022/filtered
- /tmp/data/sim/IceCube/2023
- /tmp/data/sim/Upgrade
- /tmp/data/sim
steps:
- name: Checkout repository
uses: actions/checkout@v3
- name: Set up Python environment
uses: actions/setup-python@v4
- name: Create a mock dataset structure
run: |
set -euo pipefail
job_range_dpaths=(
/tmp/data/sim/{IceCube,Upgrade}/{2022,2023}/{generated,filtered}/{CORSIKA,neutrino-generator}/{77777,88888,99999}/{00-11,22-33,44-55}
)
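# the brace expansion above yields 2*2*2*2*3*3 = 144 job-range dirs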
# Create directories and conditionally populate files
for dpath in "${job_range_dpaths[@]}"; do
mkdir -p "$dpath"/histos/
# create 1-5 pkl files
for i in $( seq 1 "$(( (RANDOM % 5) + 1 ))" ); do
random_file=$(find "tests/data/simprod-histograms" -type f -name "*.pkl" | shuf -n 1)
cp "$random_file" "$dpath/histos/histo_$i.pkl"
done
done
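# each job-range dir now holds a histos/ subdir with 1-5 sample .pkl files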
- name: Look at filetree (before)
run: |
set -euo pipefail
tree /tmp/data/sim/
- name: Run script with matrix parameters
run: |
set -euo pipefail
set -x
./resources/sample-each-dataset.sh ${{ matrix.base_path }} 0.5 ${{ matrix.max_num_datasets }}
- name: Validate script execution
run: |
set -euo pipefail
echo "Max num of datasets: ${{ matrix.max_num_datasets }}"
# Count dataset directories (unique parents of job-range dirs like "00-11")
available_datasets=$(find ${{ matrix.base_path }} -type d -regex ".*/[0-9]+-[0-9]+$" -exec dirname {} \; | sort -u | wc -l)
echo "Available datasets: $available_datasets"
# Use the lesser of available_datasets and num_datasets for validation
expected_num_datasets=$(( available_datasets < ${{ matrix.max_num_datasets }} ? available_datasets : ${{ matrix.max_num_datasets }} ))
echo "Expected datasets: $expected_num_datasets"
# Check processed count
processed_count=$(find ${{ matrix.base_path }} -name '*.histo.hdf5' | wc -l)
echo "Processed count: $processed_count"
if [[ $processed_count -ne $expected_num_datasets ]]; then
echo "Script did not process the expected number of datasets!"
exit 1
fi
echo "All tests passed."
- name: Look at filetree (after)
run: |
set -euo pipefail
tree /tmp/data/sim/
###########################################################################
# RELEASE
###########################################################################

release:
if: github.ref == 'refs/heads/main'
needs: [ py-setup, flake8, mypy, code-format, unit-tests ]
needs: [ py-setup, flake8, mypy, code-format, unit-tests, test-wrapper-script ]
runs-on: ubuntu-latest
concurrency: release # prevent any possible race conditions
steps:
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -9,8 +9,8 @@ name = "icecube-simprod-histogram"
description = "Utilities for working with histograms created for simprod"
readme = "README.md"
keywords = ["histogram sampling", "simulation", "statistics"]
classifiers = ["Development Status :: 3 - Alpha", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13"]
requires-python = ">=3.9, <3.14"
classifiers = ["Development Status :: 3 - Alpha", "Programming Language :: Python :: 3.11"]
requires-python = ">=3.11, <3.12"

[[project.authors]]
name = "IceCube"
cp-src-histos-tree.sh
@@ -1,4 +1,5 @@
#!/bin/bash
set -euo pipefail

########################################################################################
# Script Name: Simprod Histogram Sampler
@@ -8,14 +9,14 @@
# to a destination directory in the user's home directory. It also provides
# an option for a dry run to preview actions without making changes.
#
# Usage: cp_src_histos_tree.sh <SOURCE_DIR> [--dryrun]
# Usage: cp-src-histos-tree.sh <SOURCE_DIR> [--dryrun]
#
# Parameters:
# SOURCE_DIR : The source directory containing the "*/histos" directories to sample from.
# --dryrun : Optional flag that, if provided, skips actual file and directory operations,
# outputting actions to be taken without modifying any files.
#
# Example: cp_src_histos_tree.sh /path/to/source --dryrun
# Example: cp-src-histos-tree.sh /path/to/source --dryrun
#
# Notes:
# - Sampling percentages for directories and files are set to 10% by default.
166 changes: 166 additions & 0 deletions resources/sample-each-dataset.sh
@@ -0,0 +1,166 @@
#!/bin/bash
set -euo pipefail

#######################################################################################
# This script automates the sampling of histograms from dataset directories. It takes
# a base directory containing simulation datasets, a sample percentage for the histograms,
# and the number of datasets to process. It scans each dataset directory to check for
# existing histogram files and skips any datasets that have already been processed.
#
# Usage:
# ./sample-each-dataset.sh <BASE_PATH> <SAMPLE_PERCENTAGE> <MAX_NUM_DATASETS>
#
# Arguments:
# <BASE_PATH> - The root path under which all dataset directories are located.
# Example paths:
# /data/sim/IceCube/2023/generated/neutrino-generator/22645
# /data/sim/IceCube/2023/generated/neutrino-generator/
# /data/sim/IceCube/2023/generated/
# /data/sim/IceCube/2023/
# /data/sim/IceCube/
# <SAMPLE_PERCENTAGE> - Percentage of a dataset's histograms to sample
# <MAX_NUM_DATASETS> - Number of datasets to process in this run
#
# Requirements:
# - Python 3
#
#######################################################################################
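# Example invocation (hypothetical values, for illustration):
#   ./sample-each-dataset.sh /data/sim/IceCube/2023 0.5 10
#   -> samples each dataset's histograms at SAMPLE_PERCENTAGE=0.5, for up to 10 datasets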

# Check args
if [ "$#" -lt 3 ]; then
echo "Usage: $0 <BASE_PATH> <SAMPLE_PERCENTAGE> <MAX_NUM_DATASETS>"
exit 1
fi

# set BASE_PATH -> scan all datasets under this path
# ex: /data/sim/IceCube/2023/generated/neutrino-generator/22645
# ex: /data/sim/IceCube/2023/generated/neutrino-generator/
# ex: /data/sim/IceCube/2023/generated/
BASE_PATH=$1

SAMPLE_PERCENTAGE=$2
MAX_NUM_DATASETS=$3

#######################################################################################
# setup python virtual environment, install the package

PYVENV="simprod-histogram-pyvenv"
pip install virtualenv
python -m virtualenv $PYVENV
. $PYVENV/bin/activate &&
pip install --upgrade pip &&
pip install --no-cache-dir icecube-simprod-histogram
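# note: this installs the published icecube-simprod-histogram from PyPI, not the local checkout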

#######################################################################################
# pre-calculate depth-to-datasets arg for 'find'

# like /data/sim/IceCube/<year>/<generated>/<neutrino-generator>/<dataset_id>
# ex: /data/sim/IceCube/2023/generated/neutrino-generator/22645 -> depth=0
# ex: /data/sim/IceCube/2023/generated/neutrino-generator/ -> depth=1
# ex: /data/sim/IceCube/2023/generated/ -> depth=2
depth_to_datasets=$(python3 -c "
from pathlib import Path
import sys
path = Path(sys.argv[1])
SIM = 'sim'
N_SEGMENTS_BASE_TO_DATASET = 5
try:
base_index = list(path.parts).index(SIM)
except ValueError:
raise ValueError(f'Path {path} does not contain the base identifier {SIM}/')
segments_after_base = path.parts[base_index + 1:]
depth = N_SEGMENTS_BASE_TO_DATASET - len(segments_after_base)
if depth < 0:
raise ValueError(f'Path {path} is too specific; the user can supply up to a dataset dir')
print(depth)
" "$BASE_PATH" 2>&1)

#######################################################################################
# Run!

# Create a temporary file to track errors
error_file=$(mktemp)
echo "0" >"$error_file"
# Create a temporary file to track count
count_file=$(mktemp)
echo "0" >"count_file"
# and rm those files
cleanup() {
rm -f "$error_file"
rm -f "$count_file"
}
trap cleanup EXIT
trap cleanup ERR

# other vars
MAX_REACHED_CODE=2

# Define a helper function to process each dataset
process_dataset() {
local dataset_dir="$1"
local dest_dir="$dataset_dir" # put it into the dataset directory
local num_processed=$(cat "$count_file") # get the count from the file (wouldn't work if parallelized)

# Stop processing if the specified number of datasets has been reached
if [ "$num_processed" -ge "$MAX_NUM_DATASETS" ]; then
return $MAX_REACHED_CODE # Signals to stop processing datasets
fi

# Check if this dataset has been processed previously
if find "$dest_dir" -maxdepth 1 -name "*.histo.hdf5" | read -r; then
echo "Skipping $dataset_dir, an output file with .histo.hdf5 extension already exists in $dest_dir."
return 0 # This is okay, proceed to the next dataset
fi

# Process the dataset
echo "Processing dataset: $dataset_dir"
local error_output
error_output=$(
python -m simprod_histogram.sample_dataset \
"$dataset_dir" \
--sample-percentage "$SAMPLE_PERCENTAGE" \
--dest-dir "$dest_dir" \
2>&1
)
local exit_status=$?

# Handle subprocess exit status
if [ "$exit_status" -ne 0 ]; then
if echo "$error_output" | grep -q "HistogramNotFoundError"; then
echo "Warning: HistogramNotFoundError for $dataset_dir, skipping."
return 0 # This is okay, proceed to the next dataset
else
echo "Error: Failed to process $dataset_dir" >&2
echo "$error_output" >&2
echo "1" >"$error_file" # Set error flag in the temporary file
return 1 # Error! Stop processing datasets
fi
else
echo "Successfully processed $dataset_dir"
echo "$((num_processed + 1))" >"$count_file"
return 0 # This is okay, proceed to the next dataset
fi
}

export -f process_dataset
export SAMPLE_PERCENTAGE MAX_NUM_DATASETS MAX_REACHED_CODE error_file count_file

# Use find with -exec to process each dataset; each invocation runs in its own subshell, so progress and errors are shared via the temp files
find "$BASE_PATH" \
-mindepth "$depth_to_datasets" \
-maxdepth "$depth_to_datasets" \
-type d \
-exec bash -c 'process_dataset "$0"' {} \;

# Check if any errors were flagged
if [ "$(cat "$error_file")" -ne 0 ]; then
echo "Exiting with error (see above)." >&2
exit 1
fi

#######################################################################################

echo "Done."
2 changes: 1 addition & 1 deletion simprod_histogram/display_histos.py
@@ -92,7 +92,7 @@ def main():
parser.add_argument(
"path",
type=Path,
help="the dataset directory to grab pickled histograms",
help="the path to the histogram file (pickle, json, or hdf5)",
)
args = parser.parse_args()
