From dbdd1fdd0c13eef92942f94a8751b0670216ddc1 Mon Sep 17 00:00:00 2001
From: Simon Adamov
Date: Sun, 28 Apr 2024 21:40:51 +0200
Subject: [PATCH] slurm submission update for Balfrin

- larger batch_size bc more vRAM
- more workers bc more cpu cores
---
 slurm_eval.sh    |  9 ++++-----
 slurm_param.sh   |  2 +-
 slurm_predict.sh |  7 ++++---
 slurm_train.sh   | 14 +++++++-------
 4 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/slurm_eval.sh b/slurm_eval.sh
index 1675fd5f..67cf9eeb 100644
--- a/slurm_eval.sh
+++ b/slurm_eval.sh
@@ -2,10 +2,9 @@
 #SBATCH --job-name=NeurWPe
 #SBATCH --account=s83
 #SBATCH --nodes=1
-#SBATCH --gres=gpu:8
-#SBATCH --ntasks-per-node=8
+#SBATCH --ntasks-per-node=4
 #SBATCH --partition=normal
-#SBATCH --mem=375G
+#SBATCH --mem=444G
 #SBATCH --no-requeue
 #SBATCH --output=lightning_logs/neurwp_eval_out.log
 #SBATCH --error=lightning_logs/neurwp_eval_err.log
@@ -39,7 +38,7 @@ fi
 
 echo "Evaluating model"
 if [ "$MODEL" = "hi_lam" ]; then
-    srun -ul python train_model.py --dataset $DATASET --val_interval 2 --epochs 1 --n_workers 4 --batch_size 1 --subset_ds 1 --model hi_lam --graph hierarchical --load wandb/example.ckpt --eval="test"
+    srun -ul python train_model.py --dataset $DATASET --val_interval 2 --epochs 1 --n_workers 8 --batch_size 12 --subset_ds 1 --model hi_lam --graph hierarchical --load "wandb/example.ckpt" --eval="test"
 else
-    srun -ul python train_model.py --dataset $DATASET --val_interval 2 --epochs 1 --n_workers 4 --batch_size 1 --subset_ds 1 --load "wandb/example.ckpt" --eval="test"
+    srun -ul python train_model.py --dataset $DATASET --val_interval 2 --epochs 1 --n_workers 8 --batch_size 12 --subset_ds 1 --load "wandb/example.ckpt" --eval="test"
 fi
diff --git a/slurm_param.sh b/slurm_param.sh
index 3c27bf74..ecc98c0e 100644
--- a/slurm_param.sh
+++ b/slurm_param.sh
@@ -4,7 +4,7 @@
 #SBATCH --time=24:00:00
 #SBATCH --nodes=2
 #SBATCH --partition=postproc
-#SBATCH --mem=375G
+#SBATCH --mem=444G
 #SBATCH --no-requeue
 #SBATCH --exclusive
 #SBATCH --output=lightning_logs/neurwp_param_out.log
diff --git a/slurm_predict.sh b/slurm_predict.sh
index e81ceef0..34527ea7 100644
--- a/slurm_predict.sh
+++ b/slurm_predict.sh
@@ -3,7 +3,8 @@
 #SBATCH --account=s83
 #SBATCH --partition=normal
 #SBATCH --nodes=1
-#SBATCH --ntasks-per-node=8
+#SBATCH --ntasks-per-node=4
+#SBATCH --mem=444G
 #SBATCH --time=00:59:00
 #SBATCH --no-requeue
 #SBATCH --output=lightning_logs/neurwp_pred_out.log
@@ -38,7 +39,7 @@ fi
 
 echo "Predicting with model"
 if [ "$MODEL" = "hi_lam" ]; then
-    srun -ul python train_model.py --dataset $DATASET --val_interval 2 --epochs 1 --n_workers 4 --batch_size 1 --subset_ds 1 --model hi_lam --graph hierarchical --load wandb/example.ckpt --eval="predict"
+    srun -ul python train_model.py --dataset $DATASET --val_interval 2 --epochs 1 --n_workers 12 --batch_size 1 --subset_ds 1 --model hi_lam --graph hierarchical --load wandb/example.ckpt --eval="predict"
 else
-    srun -ul python train_model.py --dataset $DATASET --val_interval 2 --epochs 1 --n_workers 4 --batch_size 1 --subset_ds 1 --load "wandb/example.ckpt" --eval="predict"
+    srun -ul python train_model.py --dataset $DATASET --val_interval 2 --epochs 1 --n_workers 12 --batch_size 1 --subset_ds 1 --load "wandb/example.ckpt" --eval="predict"
 fi
diff --git a/slurm_train.sh b/slurm_train.sh
index 9ad5cc5e..06ed905c 100644
--- a/slurm_train.sh
+++ b/slurm_train.sh
@@ -2,17 +2,16 @@
 #SBATCH --job-name=NeurWP
 #SBATCH --account=s83
 #SBATCH --time=24:00:00
-#SBATCH --nodes=5
-#SBATCH --gres=gpu:8
-#SBATCH --ntasks-per-node=8
+#SBATCH --nodes=4
+#SBATCH --ntasks-per-node=4
 #SBATCH --partition=normal
-#SBATCH --mem=375G
+#SBATCH --mem=444G
 #SBATCH --no-requeue
 #SBATCH --exclusive
 #SBATCH --output=lightning_logs/neurwp_out.log
 #SBATCH --error=lightning_logs/neurwp_err.log
 
-export PREPROCESS=true
+export PREPROCESS=false
 export NORMALIZE=false
 export DATASET="cosmo"
 export MODEL="hi_lam"
@@ -41,7 +40,8 @@ fi
 
 echo "Training model"
 if [ "$MODEL" = "hi_lam" ]; then
-    srun -ul python train_model.py --dataset $DATASET --val_interval 20 --epochs 40 --n_workers 4 --batch_size 1 --subset_ds 0 --model hi_lam --graph hierarchical
+    srun -ul python train_model.py --dataset $DATASET --val_interval 20 --epochs 20 --n_workers 8 \
+        --batch_size 12 --subset_ds 0 --model hi_lam --graph hierarchical
 else
-    srun -ul python train_model.py --dataset $DATASET --val_interval 20 --epochs 40 --n_workers 4 --batch_size 1 --subset_ds 0
+    srun -ul python train_model.py --dataset $DATASET --val_interval 20 --epochs 40 --n_workers 8 --batch_size 12 --subset_ds 0
 fi