Refactor SLURM submission script and remove unnecessary code
egorkrash committed Mar 22, 2024
1 parent 64755ac commit e40b169
Showing 2 changed files with 4 additions and 56 deletions.
6 changes: 2 additions & 4 deletions src/run.py
@@ -24,10 +24,8 @@ def main(config_name):
     experiment_folder = finetuning_pipeline.experiment_folder
     n_gpu_hours = config.experiment_arguments.n_gpu_hours
     slurm_sl = config.experiment_arguments.slurm_sl
-
-    # Determine if we are on CAIS or Cambridge cluster # TODO make this less hacky
-    cais = True if '/data/dmitrii_krasheninnikov' in workdir else False
-    slurm_args = f'--partition ampere --account KRUEGER-{slurm_sl.upper()}-GPU' if not cais else '--partition=single'
+
+    slurm_args = f'--partition ampere --account KRUEGER-{slurm_sl.upper()}-GPU'
 
     sbatch_command = (f'sbatch {slurm_args} --time={n_gpu_hours}:00:00 '
                       f'src/slurm_submit_script \"{application}\" \"{options}\" \"{workdir}\" \"{experiment_folder}\"')
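
For reference, the refactored run.py now builds the same sbatch invocation regardless of cluster. A rough expansion of sbatch_command, assuming placeholder values n_gpu_hours=12 and slurm_sl='sl2' (illustrative only, not taken from this commit):

# Illustrative expansion of sbatch_command; <application>, <options>, <workdir> and
# <experiment_folder> stand in for the values interpolated by run.py.
sbatch --partition ampere --account KRUEGER-SL2-GPU --time=12:00:00 \
    src/slurm_submit_script "<application>" "<options>" "<workdir>" "<experiment_folder>"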
54 changes: 2 additions & 52 deletions src/slurm_submit_script
@@ -1,81 +1,31 @@
#!/bin/bash
#!
#! SLURM job script for Wilkes3 (AMD EPYC 7763, ConnectX-6, A100)
#!

#!#############################################################
#!#### Modify the options in this section as appropriate ######
#!#############################################################


#! sbatch directives begin here ###############################
#! Name of the job:
#SBATCH -J internalization-gpu

#! How many whole nodes should be allocated?
#SBATCH --nodes=1
#! How many (MPI) tasks will there be in total?
#! Note probably this should not exceed the total number of GPUs in use.
#SBATCH --ntasks=1

#! Specify the number of GPUs per node (between 1 and 4; must be 4 if nodes>1).
#! Note that the job submission script will enforce no more than 32 cpus per GPU.
#SBATCH --gres=gpu:1

#! What types of email messages do you wish to receive?
#SBATCH --mail-type=ALL
#! Uncomment this to prevent the job from being requeued (e.g. if
#! interrupted by node failure or system downtime):
##SBATCH --no-requeue

#! sbatch directives end here (put any additional directives above this line)

#! Notes:
#! Charging is determined by GPU number*walltime.

#! Number of nodes and tasks per node allocated by SLURM (do not change):
numnodes=$SLURM_JOB_NUM_NODES
numtasks=$SLURM_NTASKS
mpi_tasks_per_node=$(echo "$SLURM_TASKS_PER_NODE" | sed -e 's/^\([0-9][0-9]*\).*$/\1/')
#! ############################################################

#! Modify the settings below to specify the application's environment, location
#! and launch method:

application=$1 #! Full path to application executable:
options=$2
#! Work directory (i.e. where the job will run):
workdir=$3 # The value of SLURM_SUBMIT_DIR sets workdir to the directory in which sbatch is run.
experiment_folder=$4

#! Optionally modify the environment seen by the application
#! (note that SLURM reproduces the environment at submission irrespective of ~/.bashrc):

. /etc/profile.d/modules.sh # Leave this line (enables the module command)
module purge # Removes all modules still loaded
module load rhel8/default-amp # REQUIRED - loads the basic environment
#! Insert additional module load commands after this line if needed:
conda init bash
conda activate gpt

#! Are you using OpenMP (NB this is unrelated to OpenMPI)? If so increase this
#! safe value to no more than 128:
export OMP_NUM_THREADS=1

#! Number of MPI tasks to be started by the application per node and in total (do not change):
np=$[${numnodes}*${mpi_tasks_per_node}]

#! Choose this for a pure shared-memory OpenMP parallel program on a single node:
#! (OMP_NUM_THREADS threads will be created):

CMD="$application $options"
CMD_RM="rm machine.file.$SLURM_JOB_ID*"
CMD_MV="mv slurm-$SLURM_JOB_ID*.out $experiment_folder"
#! Choose this for a MPI code using OpenMPI:
#CMD="mpirun -npernode $mpi_tasks_per_node -np $np $application $options"


###############################################################
### You should not have to change anything below this line ####
###############################################################

cd $workdir
echo -e "Changed directory to `pwd`.\n"
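
As an aside, the mpi_tasks_per_node line above keeps only the leading number of SLURM_TASKS_PER_NODE. A quick sketch of how that sed expression behaves for hypothetical values of the variable (not taken from this job):

# Hypothetical SLURM_TASKS_PER_NODE values and what the sed expression extracts:
echo "1"      | sed -e 's/^\([0-9][0-9]*\).*$/\1/'   # -> 1   (one task on a single node)
echo "32(x2)" | sed -e 's/^\([0-9][0-9]*\).*$/\1/'   # -> 32  (32 tasks on each of two nodes)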