From 44025609b35608c2ba63117ab4f9be2fc1201bd7 Mon Sep 17 00:00:00 2001
From: carlos-encs <110119864+carlos-encs@users.noreply.github.com>
Date: Mon, 21 Oct 2024 16:16:20 -0400
Subject: [PATCH 1/2] OneJob-MultiMigs

---
 src/OneJob-MultiMIGs/README.md       | 20 +++++++++++++++++
 src/OneJob-MultiMIGs/example-bash.sh | 29 ++++++++++++++++++++++++
 src/OneJob-MultiMIGs/example-tcsh.sh | 34 ++++++++++++++++++++++++++++
 3 files changed, 83 insertions(+)
 create mode 100644 src/OneJob-MultiMIGs/README.md
 create mode 100644 src/OneJob-MultiMIGs/example-bash.sh
 create mode 100644 src/OneJob-MultiMIGs/example-tcsh.sh

diff --git a/src/OneJob-MultiMIGs/README.md b/src/OneJob-MultiMIGs/README.md
new file mode 100644
index 0000000..bde5e25
--- /dev/null
+++ b/src/OneJob-MultiMIGs/README.md
@@ -0,0 +1,20 @@
+
+# One Job With Multiple MIG Devices - SAME NODE
+
+This document shows only how to run multiple independent CUDA processes in a single Slurm job. Distributed training with PyTorch, TensorFlow, or any other common ML/DL framework is not currently possible using MIG devices.
+
+* We will use two MIG devices:
+
+    --gres=gpu:nvidia_a100_2g.20gb:2
+
+* We will execute the program `gpu_burn` twice in the background (`./gpu_burn -d 60 &`), once per MIG; the launch pattern is sketched after this list.
+* The job will run until all background processes finish (see the `# wait for all pids` section).
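+
+A minimal sketch of the per-MIG launch pattern used by both example scripts (`my_app` is a hypothetical stand-in for any CUDA program):
+
+    for dev in $(echo $CUDA_VISIBLE_DEVICES | tr ',' ' '); do
+        CUDA_VISIBLE_DEVICES=$dev ./my_app &   # one process pinned to one MIG device
+    done
+    wait   # block until every background process exits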
+
+ 
\ No newline at end of file
diff --git a/src/OneJob-MultiMIGs/example-bash.sh b/src/OneJob-MultiMIGs/example-bash.sh
new file mode 100644
index 0000000..b51d56c
--- /dev/null
+++ b/src/OneJob-MultiMIGs/example-bash.sh
@@ -0,0 +1,29 @@
+#!/encs/bin/bash
+
+#SBATCH --job-name=multi-mig-test
+#SBATCH --output=%j-%N-out.txt
+#SBATCH --error=%j-%N-err.txt
+#SBATCH --partition=pt
+#SBATCH --ntasks=9
+#SBATCH --mail-type=all
+##SBATCH --mem-per-cpu=4G
+#SBATCH --gres=gpu:nvidia_a100_2g.20gb:2
+
+# module load python/3.9.9-jh
+. /encs/pkg/modules-3.2.10/root/Modules/3.2.10/init/bash
+
+cd /speed-scratch/carlos/gputest/gpu-burn
+j=0
+for i in $(echo $CUDA_VISIBLE_DEVICES | tr ',' ' '); do   # one iteration per MIG device
+    OMP_NUM_THREADS=4 CUDA_VISIBLE_DEVICES=${i} ./gpu_burn -d 60 &   # pin this run to one MIG
+    pids[${j}]=$!
+    j=$((j+1))
+done
+
+sleep 20
+nvidia-smi   # snapshot utilization while the burns run
+
+# wait for all pids so the job does not exit before the background runs finish
+for pid in ${pids[*]}; do
+    wait $pid
+done
diff --git a/src/OneJob-MultiMIGs/example-tcsh.sh b/src/OneJob-MultiMIGs/example-tcsh.sh
new file mode 100644
index 0000000..e3a9ea1
--- /dev/null
+++ b/src/OneJob-MultiMIGs/example-tcsh.sh
@@ -0,0 +1,34 @@
+#!/encs/bin/tcsh
+
+#SBATCH --job-name=multi-mig-test
+#SBATCH --output=%j-%N-out.txt
+#SBATCH --error=%j-%N-err.txt
+#SBATCH --partition=pt
+#SBATCH --ntasks=9
+#SBATCH --mail-type=all
+##SBATCH --mem-per-cpu=4G
+#SBATCH --gres=gpu:nvidia_a100_2g.20gb:2
+
+cd /speed-scratch/carlos/gputest/gpu-burn
+set pids = ()
+foreach i ( `echo $CUDA_VISIBLE_DEVICES | tr ',' ' '` )
+    setenv OMP_NUM_THREADS 4
+    setenv CUDA_VISIBLE_DEVICES $i
+    ./gpu_burn -d 60 &
+    set pids = ( $pids $! )
+end
+
+sleep 20
+nvidia-smi
+
+# wait for all pids: tcsh lacks bash's 'wait <pid>', so poll /proc until each one exits
+set all_done = 0
+while (! $all_done)
+    set all_done = 1
+    foreach pid ($pids)
+        if ( -e /proc/$pid ) then
+            set all_done = 0
+        endif
+    end
+    sleep 1
+end

From 472f19ad12a5274c5b68d9548eb02bcab29ede73 Mon Sep 17 00:00:00 2001
From: carlos-encs <110119864+carlos-encs@users.noreply.github.com>
Date: Tue, 29 Oct 2024 12:07:31 -0400
Subject: [PATCH 2/2] Formatted + ref added

---
 src/{OneJob-MultiMIGs => single-job-multi-mig}/README.md | 5 ++++-
 .../example-bash.sh                                      | 0
 .../example-tcsh.sh                                      | 0
 3 files changed, 4 insertions(+), 1 deletion(-)
 rename src/{OneJob-MultiMIGs => single-job-multi-mig}/README.md (77%)
 rename src/{OneJob-MultiMIGs => single-job-multi-mig}/example-bash.sh (100%)
 rename src/{OneJob-MultiMIGs => single-job-multi-mig}/example-tcsh.sh (100%)

diff --git a/src/OneJob-MultiMIGs/README.md b/src/single-job-multi-mig/README.md
similarity index 77%
rename from src/OneJob-MultiMIGs/README.md
rename to src/single-job-multi-mig/README.md
index bde5e25..4aa1c0e 100644
--- a/src/OneJob-MultiMIGs/README.md
+++ b/src/single-job-multi-mig/README.md
@@ -1,5 +1,8 @@
 
-# One Job With Multiple MIG Devices - SAME NODE
+# Single Job With Multiple MIG Devices - SAME NODE
+# README
+
+Adapted to Speed from: https://wiki.orc.gmu.edu/mkdocs/slurm_with_multiple_mig_devices/
 
 This document shows only how to run multiple independent CUDA processes in a single Slurm job. Distributed training with PyTorch, TensorFlow, or any other common ML/DL framework is not currently possible using MIG devices.
 
diff --git a/src/OneJob-MultiMIGs/example-bash.sh b/src/single-job-multi-mig/example-bash.sh
similarity index 100%
rename from src/OneJob-MultiMIGs/example-bash.sh
rename to src/single-job-multi-mig/example-bash.sh
diff --git a/src/OneJob-MultiMIGs/example-tcsh.sh b/src/single-job-multi-mig/example-tcsh.sh
similarity index 100%
rename from src/OneJob-MultiMIGs/example-tcsh.sh
rename to src/single-job-multi-mig/example-tcsh.sh