-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Output File Updates; Add Wrapper Script [minor] (#3)
Co-authored-by: github-actions <[email protected]>
- Loading branch information
Showing
7 changed files
with
465 additions
and
75 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,7 +21,8 @@ jobs: | |
token: ${{ secrets.PERSONAL_ACCESS_TOKEN }} | ||
- uses: WIPACrepo/[email protected] | ||
with: | ||
python_min: 3.9 | ||
python_min: 3.11 | ||
python_max: 3.11 | ||
pypi_name: icecube-simprod-histogram | ||
author: IceCube | ||
author_email: [email protected] | ||
|
@@ -41,8 +42,6 @@ jobs: | |
- uses: actions/checkout@v3 | ||
- id: versions | ||
uses: WIPACrepo/[email protected] | ||
with: | ||
range: ">=3.12" | ||
|
||
flake8: | ||
needs: [ py-versions ] | ||
|
@@ -83,18 +82,20 @@ jobs: | |
- uses: actions/setup-python@v4 | ||
- name: Run Ruff for code formatting | ||
run: | | ||
set -euo pipefail | ||
pip install ruff | ||
ruff check --select C408 --fix . --unsafe-fixes | ||
- name: Commit formatted code | ||
run: | | ||
set -euo pipefail | ||
git config user.name github-actions | ||
git config user.email [email protected] | ||
git add . | ||
git commit -m "<bot> auto code format file(s)" || true | ||
git push || true | ||
########################################################################### | ||
# UNIT TESTS | ||
# TESTS | ||
########################################################################### | ||
|
||
unit-tests: | ||
|
@@ -109,19 +110,100 @@ jobs: | |
- uses: actions/setup-python@v4 | ||
with: | ||
python-version: ${{ matrix.py3 }} | ||
- run: | | ||
- name: install | ||
run: | | ||
set -euo pipefail | ||
pip install .[tests] | ||
- name: Run unit tests | ||
run: | | ||
set -euo pipefail | ||
pytest -vvv tests/unit/ | ||
test-wrapper-script: | ||
runs-on: ubuntu-latest | ||
strategy: | ||
fail-fast: false | ||
matrix: | ||
max_num_datasets: | ||
- 1 | ||
- 25 | ||
- 100 # aka all of them, currently, there are 48 | ||
base_path: | ||
- /tmp/data/sim/Upgrade/2022/generated/neutrino-generator/88888 | ||
- /tmp/data/sim/IceCube/2023/filtered/CORSIKA | ||
- /tmp/data/sim/Upgrade/2022/filtered | ||
- /tmp/data/sim/IceCube/2023 | ||
- /tmp/data/sim/Upgrade | ||
- /tmp/data/sim | ||
steps: | ||
- name: Checkout repository | ||
uses: actions/checkout@v3 | ||
- name: Set up Python environment | ||
uses: actions/setup-python@v4 | ||
- name: Create a mock dataset structure | ||
run: | | ||
set -euo pipefail | ||
job_range_dpaths=( | ||
/tmp/data/sim/{IceCube,Upgrade}/{2022,2023}/{generated,filtered}/{CORSIKA,neutrino-generator}/{77777,88888,99999}/{00-11,22-33,44-55} | ||
) | ||
# Create directories and conditionally populate files | ||
for dpath in "${job_range_dpaths[@]}"; do | ||
mkdir -p "$dpath"/histos/ | ||
# create 1-5 pkl files | ||
for i in $( seq 1 "$(( (RANDOM % 5) + 1 ))" ); do | ||
random_file=$(find "tests/data/simprod-histograms" -type f -name "*.pkl" | shuf -n 1) | ||
cp "$random_file" "$dpath/histos/histo_$i.pkl" | ||
done | ||
done | ||
- name: Look at filetree (before) | ||
run: | | ||
set -euo pipefail | ||
tree /tmp/data/sim/ | ||
- name: Run script with matrix parameters | ||
run: | | ||
set -euo pipefail | ||
set -x | ||
./resources/sample-each-dataset.sh ${{ matrix.base_path }} 0.5 ${{ matrix.max_num_datasets }} | ||
- name: Validate script execution | ||
run: | | ||
set -euo pipefail | ||
echo "Max num of datasets: ${{ matrix.max_num_datasets }}" | ||
# Count dataset directories containing at least one "*.histo.hdf5" file | ||
available_datasets=$(find ${{ matrix.base_path }} -type d -regex ".*/[0-9]+-[0-9]+$" -exec dirname {} \; | sort -u | wc -l) | ||
echo "Available datasets: $available_datasets" | ||
# Use the lesser of available_datasets and num_datasets for validation | ||
expected_num_datasets=$(( available_datasets < ${{ matrix.max_num_datasets }} ? available_datasets : ${{ matrix.max_num_datasets }} )) | ||
echo "Expected datasets: $expected_num_datasets" | ||
# Check processed count | ||
processed_count=$(find ${{ matrix.base_path }} -name '*.histo.hdf5' | wc -l) | ||
echo "Processed count: $processed_count" | ||
if [[ $processed_count -ne $expected_num_datasets ]]; then | ||
echo "Script did not process the expected number of datasets!" | ||
exit 1 | ||
fi | ||
echo "All tests passed." | ||
- name: Look at filetree (after) | ||
run: | | ||
set -euo pipefail | ||
tree /tmp/data/sim/ | ||
########################################################################### | ||
# RELEASE | ||
########################################################################### | ||
|
||
release: | ||
if: github.ref == 'refs/heads/main' | ||
needs: [ py-setup, flake8, mypy, code-format, unit-tests ] | ||
needs: [ py-setup, flake8, mypy, code-format, unit-tests, test-wrapper-script ] | ||
runs-on: ubuntu-latest | ||
concurrency: release # prevent any possible race conditions | ||
steps: | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,166 @@ | ||
#!/bin/bash
set -euo pipefail

#######################################################################################
# Samples histograms from simulation dataset directories.
#
# Given a base directory containing simulation datasets, a sample percentage for the
# histograms, and a cap on how many datasets to process, this script scans each
# dataset directory, checking for existing histogram files and skipping any datasets
# that have already been processed.
#
# Usage:
#   ./sample-each-dataset.sh <BASE_PATH> <SAMPLE_PERCENTAGE> <MAX_NUM_DATASETS>
#
# Arguments:
#   <BASE_PATH>         - The root path under which all dataset directories are located.
#                         Example paths:
#                           /data/sim/IceCube/2023/generated/neutrino-generator/22645
#                           /data/sim/IceCube/2023/generated/neutrino-generator/
#                           /data/sim/IceCube/2023/generated/
#                           /data/sim/IceCube/2023/
#                           /data/sim/IceCube/
#   <SAMPLE_PERCENTAGE> - Percentage of a dataset's histograms to sample
#   <MAX_NUM_DATASETS>  - Number of datasets to process in this run
#
# Requirements:
#   - Python 3
#######################################################################################

# Bail out early unless all three positional arguments were supplied.
if (( $# < 3 )); then
    printf '%s\n' "Usage: $0 <BASE_PATH> <SAMPLE_PERCENTAGE> <MAX_NUM_DATASETS>"
    exit 1
fi

# Root to scan for datasets — may itself be a dataset dir, or any ancestor of one
# (e.g. /data/sim/IceCube/2023/generated/neutrino-generator/22645, or just
#  /data/sim/IceCube/2023/generated/).
BASE_PATH=$1
SAMPLE_PERCENTAGE=$2    # percentage of each dataset's histograms to sample
MAX_NUM_DATASETS=$3     # cap on how many datasets this run will process
|
||
#######################################################################################
# Set up a python virtual environment and install the package.

readonly PYVENV="simprod-histogram-pyvenv"
# Use the stdlib 'venv' module (the script already requires Python 3) instead of
# pip-installing the third-party 'virtualenv' package into the ambient environment.
python3 -m venv "$PYVENV"
# shellcheck disable=SC1091 -- the activate script only exists at runtime
. "$PYVENV/bin/activate"
pip install --upgrade pip
pip install --no-cache-dir icecube-simprod-histogram
|
||
#######################################################################################
# Pre-calculate the depth-to-datasets arg for 'find'.
#
# Dataset dirs look like /data/sim/IceCube/<year>/<generated>/<neutrino-generator>/<dataset_id>
#   ex: /data/sim/IceCube/2023/generated/neutrino-generator/22645 -> depth=0
#   ex: /data/sim/IceCube/2023/generated/neutrino-generator/      -> depth=1
#   ex: /data/sim/IceCube/2023/generated/                         -> depth=2
#
# FIX: stderr is no longer merged into the captured value (was '2>&1'), which
# silently swallowed python tracebacks — 'set -e' aborted with no message at all.
depth_to_datasets=$(python3 -c "
from pathlib import Path
import sys

path = Path(sys.argv[1])
SIM = 'sim'
N_SEGMENTS_BASE_TO_DATASET = 5

try:
    base_index = list(path.parts).index(SIM)
except ValueError:
    raise ValueError(f'Path {path} does not contain the base identifier {SIM}/')

segments_after_base = path.parts[base_index + 1:]
depth = N_SEGMENTS_BASE_TO_DATASET - len(segments_after_base)
if depth < 0:
    raise ValueError(f'Path {path} is too specific; the user can supply up to a dataset dir')

print(depth)
" "$BASE_PATH")

# Belt-and-suspenders: make sure we really got a non-negative integer back.
if ! [[ "$depth_to_datasets" =~ ^[0-9]+$ ]]; then
    echo "Error: could not compute dataset depth for '$BASE_PATH'" >&2
    exit 1
fi
|
||
#######################################################################################
# Run!

# Temp files track state across the per-dataset child shells
# (a child process cannot modify this shell's variables).

# Create a temporary file to track errors — holds "1" once any dataset fails.
error_file=$(mktemp)
echo "0" >"$error_file"

# Create a temporary file to track count of processed datasets.
count_file=$(mktemp)
# FIX: was '>"count_file"' (a literal filename) — the real "$count_file" stayed
# empty, so the numeric '-ge' comparison in process_dataset operated on "".
echo "0" >"$count_file"

# Remove the tracking files on any exit path.
cleanup() {
    rm -f -- "$error_file" "$count_file"
}
trap cleanup EXIT
trap cleanup ERR

# other vars
MAX_REACHED_CODE=2   # return code meaning "dataset cap reached — stop walking"
|
||
#######################################
# Process a single dataset directory: sample its histograms into the dataset
# dir itself, skipping datasets that are done or already have output.
# Globals:   SAMPLE_PERCENTAGE, MAX_NUM_DATASETS, MAX_REACHED_CODE (read)
#            count_file, error_file (read/written — paths to tracking files)
# Arguments: $1 - dataset directory
# Returns:   0 to continue; 1 on processing error (also flags error_file);
#            MAX_REACHED_CODE when the dataset cap has been reached.
#######################################
process_dataset() {
    local dataset_dir="$1"
    local dest_dir="$dataset_dir"  # put output into the dataset directory

    # Split declaration from assignment so a 'cat' failure isn't masked by
    # 'local' always returning 0 (SC2155).
    local num_processed
    num_processed=$(cat "$count_file")  # NOTE: wouldn't work if parallelized

    # Stop processing if the specified number of datasets has been reached.
    if [ "$num_processed" -ge "$MAX_NUM_DATASETS" ]; then
        return "$MAX_REACHED_CODE"  # Signals to stop processing datasets
    fi

    # Check if this dataset has been processed previously.
    if find "$dest_dir" -maxdepth 1 -name "*.histo.hdf5" | read -r; then
        echo "Skipping $dataset_dir, an output file with .histo.hdf5 extension already exists in $dest_dir."
        return 0  # This is okay, proceed to the next dataset
    fi

    # Process the dataset.
    echo "Processing dataset: $dataset_dir"
    local error_output
    error_output=$(
        python -m simprod_histogram.sample_dataset \
            "$dataset_dir" \
            --sample-percentage "$SAMPLE_PERCENTAGE" \
            --dest-dir "$dest_dir" \
            2>&1
    )
    local exit_status=$?  # must capture on this line — 'local' alone clobbers $?

    # Handle subprocess exit status.
    if [ "$exit_status" -ne 0 ]; then
        if echo "$error_output" | grep -q "HistogramNotFoundError"; then
            # Dataset simply has no histograms — not a failure.
            echo "Warning: HistogramNotFoundError for $dataset_dir, skipping."
            return 0  # This is okay, proceed to the next dataset
        else
            echo "Error: Failed to process $dataset_dir" >&2
            echo "$error_output" >&2
            echo "1" >"$error_file"  # Set error flag in the temporary file
            return 1  # Error! Stop processing datasets
        fi
    else
        echo "Successfully processed $dataset_dir"
        echo "$((num_processed + 1))" >"$count_file"
        return 0  # This is okay, proceed to the next dataset
    fi
}

# Make the function and its globals visible to the child bash spawned per dataset.
export -f process_dataset
export SAMPLE_PERCENTAGE MAX_NUM_DATASETS MAX_REACHED_CODE error_file count_file
|
||
# Walk every dataset dir under BASE_PATH and process each one.
#
# Each dataset is handled by a child bash so a failure can't kill this shell
# (the parent's 'set -e' is not inherited by the explicit child process).
# FIX: the original used 'find -exec', which cannot stop early — the function's
# MAX_REACHED_CODE "stop" signal was silently ignored and find kept spawning
# children. This NUL-delimited loop honors it and breaks out.
while IFS= read -r -d '' dataset_dir; do
    rv=0
    bash -c 'process_dataset "$0"' "$dataset_dir" || rv=$?
    if [ "$rv" -eq "$MAX_REACHED_CODE" ]; then
        break  # processed the requested number of datasets
    fi
done < <(
    find "$BASE_PATH" \
        -mindepth "$depth_to_datasets" \
        -maxdepth "$depth_to_datasets" \
        -type d \
        -print0
)

# Check if any errors were flagged.
if [ "$(cat "$error_file")" -ne 0 ]; then
    echo "Exiting with error (see above)." >&2
    exit 1
fi

#######################################################################################

echo "Done."
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.