From 91ce3c961f4187f2b9632358153b6889487acdbd Mon Sep 17 00:00:00 2001
From: Emmanuel Leblond
Date: Tue, 1 Oct 2024 11:38:45 +0200
Subject: [PATCH] Update slurm check_queue.sh according to actual parameters
 used in prod

---
 in2p3/slurm/check_queue.sh       | 30 +++++++++++++++++++++++-------
 in2p3/start_slurm_check_queue.sh |  6 ++++--
 2 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/in2p3/slurm/check_queue.sh b/in2p3/slurm/check_queue.sh
index f0bffe9..2ae7b35 100644
--- a/in2p3/slurm/check_queue.sh
+++ b/in2p3/slurm/check_queue.sh
@@ -26,7 +26,11 @@ COND_INIT_PATH=$MINICONDA3_DIR/etc/profile.d/conda.sh
 SELF_SCRIPT_PATH=$VIGIECHIRO_DIR/slurm/check_queue.sh
 WORKER_SCRIPT_PATH=$VIGIECHIRO_DIR/slurm/worker.sh
 WORKER_JOB_NAME="w-$VIGIECHIRO_ENV_NAME"
-WORKER_JOB_OPTIONS="--ntasks=8 --mem=16G --job-name=$WORKER_JOB_NAME"
+# Set max time (i.e. `--time` param) to 2 days, given the observed job durations:
+# - 4% of jobs >5h
+# - 1% of jobs >12h
+# - Longest observed so far is 54h
+WORKER_JOB_OPTIONS="--time=2-00:00:00 --cpus-per-task=1 --mem=32GB --constraint el9 --job-name=$WORKER_JOB_NAME"
 
 function get_scheduled_workers() {
     echo "import subprocess
@@ -38,11 +42,9 @@ print(len([l for l in out.splitlines()[1:] if l.strip()]))
 " | python
 }
 
-
-
-# Run the script for 60 * 600 == 10 hours
+# Run the script for 60 * 120 == 2 hours
 # This is much lower than the default maximum time (i.e. 7 days)
-for i in `seq 600`
+for i in `seq 120`
 do
     PENDINGS=`python $VIGIECHIRO_DIR/vigiechiro-api/bin/queuer.py pendings`
     SCHEDULED_WORKERS=`get_scheduled_workers`
@@ -61,12 +63,26 @@ do
     else
         printf "[$(date)] no pending jobs\n"
     fi
+    # Remove core dumps given they quickly flood the home directory
+    rm -fv $HOME/vigiechiro-prod/Tadarida-C/tadaridaC_src/core.*
     sleep 60
 done
 
-# Then reload itself to prevent beeing killed by quotas
+# Then reload itself to prevent being killed by quotas
 # note: Couldn't use $(readlink -f $0) given slurm copy the
 # script in a temp folder before running it
 echo "[$(date)] restarting daemon"
+
+# --export option provides $VIGIECHIRO_DIR to the job process
 # --job-name=$SLURM_JOB_NAME seems broken, hence we must set this by using envvar
-SBATCH_JOB_NAME=$SLURM_JOB_NAME sbatch $SELF_SCRIPT_PATH
+# `--mem` `--time` and `--cpus-per-task` are rough estimates considering
+# `check_queue.sh` is a single small script restarting itself every 2h.
+#
+# /!\ This command must be kept in sync with the one in `start_slurm_check_queue.sh` /!\
+SBATCH_JOB_NAME=$SLURM_JOB_NAME sbatch \
+    --export=VIGIECHIRO_DIR \
+    --mem=100MB \
+    --time=0-04:00:00 \
+    --constraint el9 \
+    --cpus-per-task=1 \
+    $VIGIECHIRO_DIR/slurm/check_queue.sh
diff --git a/in2p3/start_slurm_check_queue.sh b/in2p3/start_slurm_check_queue.sh
index fe9a818..b9c5252 100644
--- a/in2p3/start_slurm_check_queue.sh
+++ b/in2p3/start_slurm_check_queue.sh
@@ -11,11 +11,13 @@ pushd $LOG_FOLDER
 # --export option provides $VIGIECHIRO_DIR to the job process
 # --job-name=$JOB_NAME seems broken, hence we must set this by using envvar
 # `--mem` `--time` and `--cpus-per-task` are rough estimates considering
-# `check_queue.sh` is a single small script restarting itself every 10h.
+# `check_queue.sh` is a single small script restarting itself every 2h.
+
+# /!\ This command must be kept in sync with the one in `slurm/check_queue.sh` /!\
 SBATCH_JOB_NAME=$JOB_NAME sbatch \
     --export=VIGIECHIRO_DIR \
     --mem=100MB \
-    --time=0-06:00:00 \
+    --time=0-04:00:00 \
     --constraint el9 \
     --cpus-per-task=1 \
     $VIGIECHIRO_DIR/slurm/check_queue.sh
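
Note on the `get_scheduled_workers` helper that appears in the hunk context above: it pipes a small Python snippet into `python` to count `squeue` output lines. A minimal pure-shell sketch of the same check, assuming `squeue` accepts the documented `-h` (suppress header) flag; the job name value here is illustrative, not the production one:

#!/bin/bash
# Sketch equivalent to the embedded-Python worker count in check_queue.sh.
# `squeue -h` drops the header line, so no header-skipping is needed, and
# `grep -c .` counts the remaining non-empty lines (one per scheduled worker).
WORKER_JOB_NAME="w-vigiechiro-prod"  # illustrative expansion of "w-$VIGIECHIRO_ENV_NAME"
SCHEDULED_WORKERS=$(squeue -u "$USER" --name="$WORKER_JOB_NAME" -h -o %j | grep -c .)
echo "scheduled workers: $SCHEDULED_WORKERS"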
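
The new `WORKER_JOB_OPTIONS` is kept as a flat space-separated string so the shell can word-split it straight into an `sbatch` invocation. The worker submission line itself sits outside this patch's hunks; a plausible sketch under that assumption (only the option string is taken from the patch, the submission command is hypothetical):

#!/bin/bash
# Hypothetical worker submission, not shown in the patch. WORKER_JOB_OPTIONS
# is deliberately left unquoted so it splits into separate sbatch arguments.
VIGIECHIRO_ENV_NAME=prod  # illustrative
WORKER_SCRIPT_PATH=$VIGIECHIRO_DIR/slurm/worker.sh
WORKER_JOB_NAME="w-$VIGIECHIRO_ENV_NAME"
WORKER_JOB_OPTIONS="--time=2-00:00:00 --cpus-per-task=1 --mem=32GB --constraint el9 --job-name=$WORKER_JOB_NAME"

sbatch $WORKER_JOB_OPTIONS $WORKER_SCRIPT_PATH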
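
Both files rely on sbatch honoring `SBATCH_*` environment variables as option defaults, which is why `SBATCH_JOB_NAME=...` prefixes the command instead of the `--job-name=$VAR` flag reported broken in the comments. The equivalence can be sanity-checked without queueing anything via `--test-only`; the script and job names below are placeholders:

#!/bin/bash
# These two invocations should behave identically: sbatch reads SBATCH_JOB_NAME
# as the default value for --job-name. --test-only validates the submission and
# prints an estimated start time without actually queueing the job.
SBATCH_JOB_NAME=check-queue-daemon sbatch --test-only my_script.sh  # my_script.sh is a placeholder
sbatch --test-only --job-name=check-queue-daemon my_script.sh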