diff --git a/src/single-job-multi-mig/README.md b/src/single-job-multi-mig/README.md
new file mode 100644
index 0000000..4aa1c0e
--- /dev/null
+++ b/src/single-job-multi-mig/README.md
@@ -0,0 +1,16 @@
+
+# Single Job With Multiple MIG Devices - SAME NODE
+# README
+
+Adapted to Speed from: https://wiki.orc.gmu.edu/mkdocs/slurm_with_multiple_mig_devices/
+
+This document only shows how to run multiple independent CUDA processes in a single slurm job. Distributed training with pytorch, tensorflow or any other common ML/DL framework is not currently possible using MIG devices
+
+* We will use 2 MIGs:
+
+  --gres=gpu:nvidia_a100_2g.20gb:2
+
+* We will execute the program `gpu_burn` twice in the background (`./gpu_burn -d 60 &`), one for each MIG.
+* The job will be executed until the processes in the background finish. (`# wait for all pids` section)
+
+
\ No newline at end of file
diff --git a/src/single-job-multi-mig/example-bash.sh b/src/single-job-multi-mig/example-bash.sh
new file mode 100644
index 0000000..b51d56c
--- /dev/null
+++ b/src/single-job-multi-mig/example-bash.sh
@@ -0,0 +1,29 @@
+#!/encs/bin/bash
+
+#SBATCH --job-name=multi-mig-test
+#SBATCH --output=%j-%N-out.txt
+#SBATCH --error=%j-%N-err.txt
+#SBATCH --partition=pt
+#SBATCH --ntasks=9
+#SBATCH --mail-type=all
+##SBATCH --mem-per-cpu=4G
+#SBATCH --gres=gpu:nvidia_a100_2g.20gb:2
+
+# module load python/3.9.9-jh
+. /encs/pkg/modules-3.2.10/root/Modules/3.2.10/init/bash
+
+cd /speed-scratch/carlos/gputest/gpu-burn
+j=0
+for i in $(echo $CUDA_VISIBLE_DEVICES | tr ',' ' '); do
+  OMP_NUM_THREADS=4 CUDA_VISIBLE_DEVICES=${i} ./gpu_burn -d 60 &
+  pids[${j}]=$!
+  j=$((j+1))
+done
+
+sleep 20
+nvidia-smi
+
+# wait for all pids
+for pid in ${pids[*]}; do
+  wait $pid
+done
diff --git a/src/single-job-multi-mig/example-tcsh.sh b/src/single-job-multi-mig/example-tcsh.sh
new file mode 100644
index 0000000..e3a9ea1
--- /dev/null
+++ b/src/single-job-multi-mig/example-tcsh.sh
@@ -0,0 +1,34 @@
+#!/encs/bin/tcsh
+
+#SBATCH --job-name=multi-mig-test
+#SBATCH --output=%j-%N-out.txt
+#SBATCH --error=%j-%N-err.txt
+#SBATCH --partition=pt
+#SBATCH --ntasks=9
+#SBATCH --mail-type=all
+##SBATCH --mem-per-cpu=4G
+#SBATCH --gres=gpu:nvidia_a100_2g.20gb:2
+
+cd /speed-scratch/carlos/gputest/gpu-burn
+set pids = ()
+foreach i ( `echo $CUDA_VISIBLE_DEVICES | tr ',' ' '` )
+  setenv OMP_NUM_THREADS 4
+  setenv CUDA_VISIBLE_DEVICES $i
+  ./gpu_burn -d 60 &
+  set pids = ( $pids $! )
+end
+
+sleep 20
+nvidia-smi
+
+# wait for all pids
+set all_done = 0
+while (! $all_done)
+  set all_done = 1
+  foreach pid ($pids)
+    if ( -e /proc/$pid ) then
+      set all_done = 0
+    endif
+  end
+  sleep 1
+end