-
Notifications
You must be signed in to change notification settings - Fork 9
/
submit_pm_mp.sh
42 lines (36 loc) · 1.2 KB
/
submit_pm_mp.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#!/bin/bash
#SBATCH -C gpu
#SBATCH -A ntrain4
#SBATCH --ntasks-per-node 4
#SBATCH --cpus-per-task 32
#SBATCH --gpus-per-node 4
#SBATCH --time=01:00:00
#SBATCH --image=nersc/pytorch:ngc-23.07-v0
#SBATCH --module=gpu,nccl-2.18
#SBATCH --reservation=sc23_dl_tutorial_2
#SBATCH -J vit-era5-mp
#SBATCH -o %x-%j.out

# SLURM submission script for model-parallel training (train_mp.py).
# All command-line arguments given to this script are forwarded to the
# training program, e.g.:  sbatch submit_pm_mp.sh --config=mp --row_parallel_size=4
# Optional env vars: ENABLE_PROFILING=1 to wrap the run in Nsight Systems,
# PROFILE_OUTPUT to name the profile report (default: "profile").

# Read-only tutorial dataset and per-user log directory.
DATADIR=/pscratch/sd/s/shas1693/data/sc23_tutorial_data/downsampled
LOGDIR="${SCRATCH}/sc23-dl-tutorial/logs"
mkdir -p "${LOGDIR}"

# Forward all script arguments as one flat string ("$*", not "${@}": assigning
# "$@" to a scalar is ShellCheck SC2124 and behaves like "$*" anyway). The
# string form is required because it is interpolated into the `bash -c`
# command below; arguments containing spaces are not supported.
args="$*"
#args="--config=mp --row_parallel_size=4"

export FI_MR_CACHE_MONITOR=userfaultfd   # libfabric memory-registration cache monitor
export HDF5_USE_FILE_LOCKING=FALSE       # avoid HDF5 file-locking issues on shared filesystems

# Profiling: when ENABLE_PROFILING=1, prepend an Nsight Systems wrapper to the
# python launch command. Capture starts at cudaProfilerApi hooks in the code.
if [ "${ENABLE_PROFILING:-0}" -eq 1 ]; then
    echo "Enabling profiling..."
    NSYS_ARGS="--trace=cuda,cublas,nvtx --cuda-graph-trace=node --kill none -c cudaProfilerApi -f true"
    NSYS_OUTPUT="${LOGDIR}/${PROFILE_OUTPUT:-profile}"
    export PROFILE_CMD="nsys profile $NSYS_ARGS -o $NSYS_OUTPUT"
fi

# Rendezvous address for distributed init: the node this batch script runs on.
export MASTER_ADDR=$(hostname)

# Reversing order of GPUs to match default CPU affinities from Slurm
export CUDA_VISIBLE_DEVICES=3,2,1,0

# if cuda graphs, use train_mp_graphs.py
set -x
srun -u shifter -V "${DATADIR}:/data" -V "${LOGDIR}:/logs" \
    bash -c "
    source export_DDP_vars.sh
    ${PROFILE_CMD} python train_mp.py ${args}
    "