Skip to content

Commit

Permalink
fix max datasets counting
Browse files Browse the repository at this point in the history
  • Loading branch information
ric-evans committed Nov 20, 2024
1 parent 108714c commit 8d36b9c
Showing 1 changed file with 15 additions and 9 deletions.
24 changes: 15 additions & 9 deletions resources/sample-each-dataset.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ set -euo pipefail
# existing histogram files and skips any datasets that have already been processed.
#
# Usage:
# ./sample-each-dataset.sh <BASE_PATH> <SAMPLE_PERCENTAGE> <NUM_DATASETS>
# ./sample-each-dataset.sh <BASE_PATH> <SAMPLE_PERCENTAGE> <MAX_NUM_DATASETS>
#
# Arguments:
# <BASE_PATH> - The root path under which all dataset directories are located.
Expand All @@ -19,7 +19,7 @@ set -euo pipefail
# /data/sim/IceCube/2023/
# /data/sim/IceCube/
# <SAMPLE_PERCENTAGE> - Percentage of a dataset's histograms to sample
# <NUM_DATASETS> - Number of datasets to process in this run
# <MAX_NUM_DATASETS> - Number of datasets to process in this run
#
# Requirements:
# - Python 3
Expand All @@ -28,7 +28,7 @@ set -euo pipefail

# Check args
if [ "$#" -lt 3 ]; then
echo "Usage: $0 <BASE_PATH> <SAMPLE_PERCENTAGE> <NUM_DATASETS>"
echo "Usage: $0 <BASE_PATH> <SAMPLE_PERCENTAGE> <MAX_NUM_DATASETS>"
exit 1
fi

Expand All @@ -39,7 +39,7 @@ fi
BASE_PATH=$1

SAMPLE_PERCENTAGE=$2
NUM_DATASETS=$3
MAX_NUM_DATASETS=$3

#######################################################################################
# setup python virtual environment, install the package
Expand Down Expand Up @@ -84,22 +84,28 @@ print(depth)
# Create a temporary file to track errors across find-invoked subshells
error_file=$(mktemp)
echo "0" >"$error_file"
# Create a temporary file to track the processed-dataset count
count_file=$(mktemp)
# BUG FIX: was `>"count_file"` (literal filename, missing `$`), which left the
# mktemp file empty and wrote a stray ./count_file; later `cat "$count_file"`
# then yielded "" and broke the -ge comparison in process_dataset.
echo "0" >"$count_file"
# and rm those files
# Remove the temporary tracking files; registered via trap so it runs on exit.
cleanup() {
  rm -f "$error_file" "$count_file"
}
trap cleanup EXIT
trap cleanup ERR

# other vars
MAX_REACHED_CODE=2
num_processed=0

# Define a helper function to process each dataset
process_dataset() {
local dataset_dir="$1"
local dest_dir="$dataset_dir" # put it into the dataset directory
local dest_dir="$dataset_dir" # put it into the dataset directory
local num_processed=$(cat "$count_file") # get the count from the file (wouldn't work if parallelized)

# Stop processing if the specified number of datasets has been reached
if [ "$num_processed" -ge "$NUM_DATASETS" ]; then
if [ "$num_processed" -ge "$MAX_NUM_DATASETS" ]; then
return $MAX_REACHED_CODE # Signals to stop processing datasets
fi

Expand Down Expand Up @@ -134,13 +140,13 @@ process_dataset() {
fi
else
echo "Successfully processed $dataset_dir"
num_processed=$((num_processed + 1))
echo "$((num_processed + 1))" >"$count_file"
return 0 # This is okay, proceed to the next dataset
fi
}

export -f process_dataset
export num_processed SAMPLE_PERCENTAGE NUM_DATASETS MAX_REACHED_CODE error_file
export SAMPLE_PERCENTAGE MAX_NUM_DATASETS MAX_REACHED_CODE error_file count_file

# Use find with -exec to process each dataset and handle return codes
find "$BASE_PATH" \
Expand Down

0 comments on commit 8d36b9c

Please sign in to comment.