diff --git a/resources/sample-each-dataset.sh b/resources/sample-each-dataset.sh
index 9af47af..ca4fa5a 100755
--- a/resources/sample-each-dataset.sh
+++ b/resources/sample-each-dataset.sh
@@ -8,7 +8,7 @@ set -euo pipefail
 # existing histogram files and skips any datasets that have already been processed.
 #
 # Usage:
-#     ./sample-each-dataset.sh <BASE_PATH> <SAMPLE_PERCENTAGE> <NUM_DATASETS>
+#     ./sample-each-dataset.sh <BASE_PATH> <SAMPLE_PERCENTAGE> <MAX_NUM_DATASETS>
 #
 # Arguments:
 #    - The root path under which all dataset directories are located.
@@ -19,7 +19,7 @@ set -euo pipefail
 #      /data/sim/IceCube/2023/
 #      /data/sim/IceCube/
 #    - Percentage of a dataset's histograms to sample
-#    - Number of datasets to process in this run
+#    - Max number of datasets to process in this run
 #
 # Requirements:
 #    - Python 3
@@ -28,7 +28,7 @@ set -euo pipefail
 
 # Check args
 if [ "$#" -lt 3 ]; then
-    echo "Usage: $0 <BASE_PATH> <SAMPLE_PERCENTAGE> <NUM_DATASETS>"
+    echo "Usage: $0 <BASE_PATH> <SAMPLE_PERCENTAGE> <MAX_NUM_DATASETS>"
     exit 1
 fi
 
@@ -39,7 +39,7 @@ fi
 
 BASE_PATH=$1
 SAMPLE_PERCENTAGE=$2
-NUM_DATASETS=$3
+MAX_NUM_DATASETS=$3
 
 #######################################################################################
 # setup python virtual environment, install the package
@@ -84,22 +84,28 @@ print(depth)
 # Create a temporary file to track errors
 error_file=$(mktemp)
 echo "0" >"$error_file"
+# Create a temporary file to track count
+count_file=$(mktemp)
+echo "0" >"$count_file"
 
+# and rm those files
 cleanup() {
     rm -f "$error_file"
+    rm -f "$count_file"
 }
 trap cleanup EXIT
+trap cleanup ERR
 
 # other vars
 MAX_REACHED_CODE=2
-num_processed=0
 
 # Define a helper function to process each dataset
 process_dataset() {
     local dataset_dir="$1"
-    local dest_dir="$dataset_dir" # put it into the dataset directory
+    local dest_dir="$dataset_dir"             # put it into the dataset directory
+    local num_processed=$(cat "$count_file")  # get the count from the file (wouldn't work if parallelized)
 
     # Stop processing if the specified number of datasets has been reached
-    if [ "$num_processed" -ge "$NUM_DATASETS" ]; then
+    if [ "$num_processed" -ge "$MAX_NUM_DATASETS" ]; then
         return $MAX_REACHED_CODE # Signals to stop processing datasets
     fi
@@ -134,13 +140,13 @@ process_dataset() {
         fi
     else
         echo "Successfully processed $dataset_dir"
-        num_processed=$((num_processed + 1))
+        echo "$((num_processed + 1))" >"$count_file"
         return 0 # This is okay, proceed to the next dataset
     fi
 }
 
 export -f process_dataset
-export num_processed SAMPLE_PERCENTAGE NUM_DATASETS MAX_REACHED_CODE error_file
+export SAMPLE_PERCENTAGE MAX_NUM_DATASETS MAX_REACHED_CODE error_file count_file
 
 # Use find with -exec to process each dataset and handle return codes
 find "$BASE_PATH" \