From 44025609b35608c2ba63117ab4f9be2fc1201bd7 Mon Sep 17 00:00:00 2001
From: carlos-encs <110119864+carlos-encs@users.noreply.github.com>
Date: Mon, 21 Oct 2024 16:16:20 -0400
Subject: [PATCH 1/2] OneJob-MultiMIGs
---
 src/OneJob-MultiMIGs/README.md       | 22 ++++++++++++++++++++++++++
 src/OneJob-MultiMIGs/example-bash.sh | 38 ++++++++++++++++++++++++++
 src/OneJob-MultiMIGs/example-tcsh.sh | 45 +++++++++++++++++++++++++++
 3 files changed, 105 insertions(+)
create mode 100644 src/OneJob-MultiMIGs/README.md
create mode 100644 src/OneJob-MultiMIGs/example-bash.sh
create mode 100644 src/OneJob-MultiMIGs/example-tcsh.sh
diff --git a/src/OneJob-MultiMIGs/README.md b/src/OneJob-MultiMIGs/README.md
new file mode 100644
index 0000000..bde5e25
--- /dev/null
+++ b/src/OneJob-MultiMIGs/README.md
@@ -0,0 +1,22 @@
+
+# One Job With Multiple MIG Devices - SAME NODE
+
+This document covers only how to run multiple independent CUDA processes within a single Slurm job. Distributed training with PyTorch, TensorFlow, or any other common ML/DL framework is not currently possible using MIG devices.
+
+* We will use 2 MIGs:
+
+ --gres=gpu:nvidia_a100_2g.20gb:2
+
+* We will run the program `gpu_burn` twice in the background (`./gpu_burn -d 60 &`), once per MIG device.
+* The job keeps running until all background processes have finished (the `# wait for all pids` section); the core pattern is sketched below.
+
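+A minimal bash sketch of the pattern (illustrative only; on a real node
+Slurm fills `CUDA_VISIBLE_DEVICES` with the MIG instances actually granted
+to the job):
+
+   # Slurm exposes the allocated MIGs as a comma-separated list in
+   # CUDA_VISIBLE_DEVICES; launch one process per device, then wait.
+   for i in $(echo $CUDA_VISIBLE_DEVICES | tr ',' ' '); do
+       CUDA_VISIBLE_DEVICES=${i} ./gpu_burn -d 60 &
+   done
+   wait   # returns once every background process has finished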
diff --git a/src/OneJob-MultiMIGs/example-bash.sh b/src/OneJob-MultiMIGs/example-bash.sh
new file mode 100644
index 0000000..b51d56c
--- /dev/null
+++ b/src/OneJob-MultiMIGs/example-bash.sh
@@ -0,0 +1,38 @@
+#!/encs/bin/bash
+
+#SBATCH --job-name=multi-mig-test
+#SBATCH --output=%j-%N-out.txt
+#SBATCH --error=%j-%N-err.txt
+#SBATCH --partition=pt
+#SBATCH --ntasks=9
+#SBATCH --mail-type=all
+##SBATCH --mem-per-cpu=4G
+#SBATCH --gres=gpu:nvidia_a100_2g.20gb:2
+
+# module load python/3.9.9-jh
+. /encs/pkg/modules-3.2.10/root/Modules/3.2.10/init/bash
+
+cd /speed-scratch/carlos/gputest/gpu-burn
+j=0
+# Slurm sets CUDA_VISIBLE_DEVICES to a comma-separated list of the MIG
+# devices allocated to the job; launch one gpu_burn per device, restricting
+# each background process to a single MIG, and record its PID.
+for i in $(echo $CUDA_VISIBLE_DEVICES | tr ',' ' '); do
+    OMP_NUM_THREADS=4 CUDA_VISIBLE_DEVICES=${i} ./gpu_burn -d 60 &
+    pids[${j}]=$!
+    j=$((j+1))
+done
+
+# Give the background processes time to start, then snapshot device usage
+sleep 20
+nvidia-smi
+
+# wait for all pids
+for pid in ${pids[*]}; do
+ wait $pid
+done
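+
+# Note: bash's wait accepts multiple PIDs, so the loop above could be
+# collapsed into a single call:
+#
+#   wait "${pids[@]}"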
diff --git a/src/OneJob-MultiMIGs/example-tcsh.sh b/src/OneJob-MultiMIGs/example-tcsh.sh
new file mode 100644
index 0000000..e3a9ea1
--- /dev/null
+++ b/src/OneJob-MultiMIGs/example-tcsh.sh
@@ -0,0 +1,45 @@
+#!/encs/bin/tcsh
+
+#SBATCH --job-name=multi-mig-test
+#SBATCH --output=%j-%N-out.txt
+#SBATCH --error=%j-%N-err.txt
+#SBATCH --partition=pt
+#SBATCH --ntasks=9
+#SBATCH --mail-type=all
+##SBATCH --mem-per-cpu=4G
+#SBATCH --gres=gpu:nvidia_a100_2g.20gb:2
+
+cd /speed-scratch/carlos/gputest/gpu-burn
+set pids = ()
+# Slurm sets CUDA_VISIBLE_DEVICES to a comma-separated list of the MIG
+# devices allocated to the job; launch one gpu_burn per device, restricting
+# each background process to a single MIG, and record its PID.
+foreach i ( `echo $CUDA_VISIBLE_DEVICES | tr ',' ' '` )
+    setenv OMP_NUM_THREADS 4
+    setenv CUDA_VISIBLE_DEVICES $i
+    ./gpu_burn -d 60 &
+    set pids = ( $pids $! )
+end
+
+# Give the background processes time to start, then snapshot device usage
+sleep 20
+nvidia-smi
+
+# wait for all pids: tcsh's wait cannot target specific PIDs, so poll
+# /proc until every background process has exited
+set all_done = 0
+while (! $all_done)
+ set all_done = 1
+ foreach pid ($pids)
+ if ( -e /proc/$pid ) then
+ set all_done = 0
+ endif
+ end
+ sleep 1
+end
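+
+# Note: tcsh's built-in `wait` takes no arguments and blocks until all of
+# the shell's children have finished, so if per-PID tracking is not needed
+# the polling loop above could be replaced with a single call:
+#
+#   wait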
From 472f19ad12a5274c5b68d9548eb02bcab29ede73 Mon Sep 17 00:00:00 2001
From: carlos-encs <110119864+carlos-encs@users.noreply.github.com>
Date: Tue, 29 Oct 2024 12:07:31 -0400
Subject: [PATCH 2/2] Formatted + ref added
---
 src/{OneJob-MultiMIGs => single-job-multi-mig}/README.md | 4 +++-
.../example-bash.sh | 0
.../example-tcsh.sh | 0
 3 files changed, 3 insertions(+), 1 deletion(-)
rename src/{OneJob-MultiMIGs => single-job-multi-mig}/README.md (77%)
rename src/{OneJob-MultiMIGs => single-job-multi-mig}/example-bash.sh (100%)
rename src/{OneJob-MultiMIGs => single-job-multi-mig}/example-tcsh.sh (100%)
diff --git a/src/OneJob-MultiMIGs/README.md b/src/single-job-multi-mig/README.md
similarity index 77%
rename from src/OneJob-MultiMIGs/README.md
rename to src/single-job-multi-mig/README.md
index bde5e25..4aa1c0e 100644
--- a/src/OneJob-MultiMIGs/README.md
+++ b/src/single-job-multi-mig/README.md
@@ -1,5 +1,7 @@
-# One Job With Multiple MIG Devices - SAME NODE
+# Single Job With Multiple MIG Devices - SAME NODE
+
+Adapted to Speed from: https://wiki.orc.gmu.edu/mkdocs/slurm_with_multiple_mig_devices/
This document covers only how to run multiple independent CUDA processes within a single Slurm job. Distributed training with PyTorch, TensorFlow, or any other common ML/DL framework is not currently possible using MIG devices.
diff --git a/src/OneJob-MultiMIGs/example-bash.sh b/src/single-job-multi-mig/example-bash.sh
similarity index 100%
rename from src/OneJob-MultiMIGs/example-bash.sh
rename to src/single-job-multi-mig/example-bash.sh
diff --git a/src/OneJob-MultiMIGs/example-tcsh.sh b/src/single-job-multi-mig/example-tcsh.sh
similarity index 100%
rename from src/OneJob-MultiMIGs/example-tcsh.sh
rename to src/single-job-multi-mig/example-tcsh.sh