Skip to content

Commit

Permalink
fix max datasets counting
Browse files Browse the repository at this point in the history
  • Loading branch information
ric-evans committed Nov 20, 2024
1 parent 108714c commit 8d36b9c
Showing 1 changed file with 15 additions and 9 deletions.
24 changes: 15 additions & 9 deletions resources/sample-each-dataset.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ set -euo pipefail
# existing histogram files and skips any datasets that have already been processed.
#
# Usage:
# ./sample-each-dataset.sh <BASE_PATH> <SAMPLE_PERCENTAGE> <NUM_DATASETS>
# ./sample-each-dataset.sh <BASE_PATH> <SAMPLE_PERCENTAGE> <MAX_NUM_DATASETS>
#
# Arguments:
# <BASE_PATH> - The root path under which all dataset directories are located.
Expand All @@ -19,7 +19,7 @@ set -euo pipefail
# /data/sim/IceCube/2023/
# /data/sim/IceCube/
# <SAMPLE_PERCENTAGE> - Percentage of a dataset's histograms to sample
# <NUM_DATASETS> - Number of datasets to process in this run
# <MAX_NUM_DATASETS> - Number of datasets to process in this run
#
# Requirements:
# - Python 3
Expand All @@ -28,7 +28,7 @@ set -euo pipefail

# Check args
if [ "$#" -lt 3 ]; then
echo "Usage: $0 <BASE_PATH> <SAMPLE_PERCENTAGE> <NUM_DATASETS>"
echo "Usage: $0 <BASE_PATH> <SAMPLE_PERCENTAGE> <MAX_NUM_DATASETS>"
exit 1
fi

Expand All @@ -39,7 +39,7 @@ fi
BASE_PATH=$1

SAMPLE_PERCENTAGE=$2
NUM_DATASETS=$3
MAX_NUM_DATASETS=$3

#######################################################################################
# setup python virtual environment, install the package
Expand Down Expand Up @@ -84,22 +84,28 @@ print(depth)
# Create a temporary file to track errors across find-invoked subshells
error_file=$(mktemp)
echo "0" >"$error_file"
# Create a temporary file to track the processed-dataset count
count_file=$(mktemp)
# BUG FIX: was `>"count_file"` (literal filename, missing `$`), which left the
# mktemp file empty and wrote a stray ./count_file; later `cat "$count_file"`
# then yielded "" and broke the -ge comparison in process_dataset.
echo "0" >"$count_file"
# and rm those files
# Remove the temporary tracking files; registered via trap so it runs on exit.
cleanup() {
  rm -f "$error_file" "$count_file"
}
trap cleanup EXIT
trap cleanup ERR

# other vars
MAX_REACHED_CODE=2
num_processed=0

# Define a helper function to process each dataset
process_dataset() {
local dataset_dir="$1"
local dest_dir="$dataset_dir" # put it into the dataset directory
local dest_dir="$dataset_dir" # put it into the dataset directory
local num_processed=$(cat "$count_file") # get the count from the file (wouldn't work if parallelized)

# Stop processing if the specified number of datasets has been reached
if [ "$num_processed" -ge "$NUM_DATASETS" ]; then
if [ "$num_processed" -ge "$MAX_NUM_DATASETS" ]; then
return $MAX_REACHED_CODE # Signals to stop processing datasets
fi

Expand Down Expand Up @@ -134,13 +140,13 @@ process_dataset() {
fi
else
echo "Successfully processed $dataset_dir"
num_processed=$((num_processed + 1))
echo "$((num_processed + 1))" >"$count_file"
return 0 # This is okay, proceed to the next dataset
fi
}

export -f process_dataset
export num_processed SAMPLE_PERCENTAGE NUM_DATASETS MAX_REACHED_CODE error_file
export SAMPLE_PERCENTAGE MAX_NUM_DATASETS MAX_REACHED_CODE error_file count_file

# Use find with -exec to process each dataset and handle return codes
find "$BASE_PATH" \
Expand Down

0 comments on commit 8d36b9c

Please sign in to comment.