-
Notifications
You must be signed in to change notification settings - Fork 89
/
1.torch-screen.sbatch
49 lines (37 loc) · 1.13 KB
/
1.torch-screen.sbatch
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#!/bin/bash
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
#SBATCH -N 2 # number of nodes to run the script on, use 2 here
#SBATCH --job-name=megatron_gpt # name of your job
#SBATCH --ntasks-per-node 1 # Number of tasks per node, we need one here
#SBATCH --gres=gpu:8 # number of GPUs we reserve
#SBATCH --exclusive
#SBATCH --wait-all-nodes=1
### Disable hyperthreading by setting the tasks per core to 1
#SBATCH --ntasks-per-core=1
# Abort on the first failing command (-e) and trace every command (-x).
set -ex
# Validate that mpirun does not need -x to propagate env vars defined in .sbatch script
###########################
###### User Variables #####
###########################
# Default variables for Enroot. Each `: "${VAR:=default}"` assigns the default
# only when VAR is unset or empty, so any of them can be overridden from the
# submitting environment.
: "${APPS_PATH:=/apps}"
: "${IMAGE:=$APPS_PATH/pytorch-screen.sqsh}"
: "${FSX_MOUNT:=/fsx:/fsx}"
: "${SCREEN_PT_SCRIPT_PATH:=$PWD}"
# Container options handed to srun. The expansions are quoted so that a value
# containing whitespace or glob characters stays a single array element instead
# of being split into several srun arguments.
declare -a ARGS=(
    --container-image "$IMAGE"
    --container-mount-home
    --container-mounts "$FSX_MOUNT"
)
# Print the hostname of the node executing this batch script, framed by blank
# lines so it stands out in the job log.
printf '\nHostname: %s\n\n' "$(hostname)"
# Dump the batch script's full environment into the job log for debugging.
env
# Launch the screening job through srun with the container options in ARGS,
# wrapped in /usr/bin/time to report how long the srun step took.
# The inline script is single-quoted and receives the script directory as $1
# (the word after the script, "bash", becomes $0). This keeps a path containing
# spaces or shell metacharacters from being re-parsed by the inner shell.
# `command -v` is a shell builtin, so the tool lookups work even when the
# container image does not ship a `which` binary.
# The inner shell does not use `set -e`: the diagnostic commands may fail
# without aborting, and the task's exit status is that of the final python run.
/usr/bin/time srun -l "${ARGS[@]}" --mpi=pmix bash -c '
command -v nvidia-smi
nvidia-smi
command -v python
python --version
python "$1/pytorch-screen.py"
' bash "${SCREEN_PT_SCRIPT_PATH}"