Skip to content

Commit

Permalink
Slurm submission update for Balfrin
Browse files Browse the repository at this point in the history
- larger batch_size because of more vRAM
- more workers because of more CPU cores
  • Loading branch information
Simon Adamov committed Apr 28, 2024
1 parent df6b651 commit dbdd1fd
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 16 deletions.
9 changes: 4 additions & 5 deletions slurm_eval.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,9 @@
#SBATCH --job-name=NeurWPe
#SBATCH --account=s83
#SBATCH --nodes=1
#SBATCH --gres=gpu:8
#SBATCH --ntasks-per-node=8
#SBATCH --ntasks-per-node=4
#SBATCH --partition=normal
#SBATCH --mem=375G
#SBATCH --mem=444G
#SBATCH --no-requeue
#SBATCH --output=lightning_logs/neurwp_eval_out.log
#SBATCH --error=lightning_logs/neurwp_eval_err.log
Expand Down Expand Up @@ -39,7 +38,7 @@ fi

echo "Evaluating model"
if [ "$MODEL" = "hi_lam" ]; then
srun -ul python train_model.py --dataset $DATASET --val_interval 2 --epochs 1 --n_workers 4 --batch_size 1 --subset_ds 1 --model hi_lam --graph hierarchical --load wandb/example.ckpt --eval="test"
srun -ul python train_model.py --dataset $DATASET --val_interval 2 --epochs 1 --n_workers 8 --batch_size 12 --subset_ds 1 --model hi_lam --graph hierarchical --load "wandb/example.ckpt" --eval="test"
else
srun -ul python train_model.py --dataset $DATASET --val_interval 2 --epochs 1 --n_workers 4 --batch_size 1 --subset_ds 1 --load "wandb/example.ckpt" --eval="test"
srun -ul python train_model.py --dataset $DATASET --val_interval 2 --epochs 1 --n_workers 8 --batch_size 12 --subset_ds 1 --load "wandb/example.ckpt" --eval="test"
fi
2 changes: 1 addition & 1 deletion slurm_param.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#SBATCH --time=24:00:00
#SBATCH --nodes=2
#SBATCH --partition=postproc
#SBATCH --mem=375G
#SBATCH --mem=444G
#SBATCH --no-requeue
#SBATCH --exclusive
#SBATCH --output=lightning_logs/neurwp_param_out.log
Expand Down
7 changes: 4 additions & 3 deletions slurm_predict.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
#SBATCH --account=s83
#SBATCH --partition=normal
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8
#SBATCH --ntasks-per-node=4
#SBATCH --mem=444G
#SBATCH --time=00:59:00
#SBATCH --no-requeue
#SBATCH --output=lightning_logs/neurwp_pred_out.log
Expand Down Expand Up @@ -38,7 +39,7 @@ fi

echo "Predicting with model"
if [ "$MODEL" = "hi_lam" ]; then
srun -ul python train_model.py --dataset $DATASET --val_interval 2 --epochs 1 --n_workers 4 --batch_size 1 --subset_ds 1 --model hi_lam --graph hierarchical --load wandb/example.ckpt --eval="predict"
srun -ul python train_model.py --dataset $DATASET --val_interval 2 --epochs 1 --n_workers 12 --batch_size 1 --subset_ds 1 --model hi_lam --graph hierarchical --load wandb/example.ckpt --eval="predict"
else
srun -ul python train_model.py --dataset $DATASET --val_interval 2 --epochs 1 --n_workers 4 --batch_size 1 --subset_ds 1 --load "wandb/example.ckpt" --eval="predict"
srun -ul python train_model.py --dataset $DATASET --val_interval 2 --epochs 1 --n_workers 12 --batch_size 1 --subset_ds 1 --load "wandb/example.ckpt" --eval="predict"
fi
14 changes: 7 additions & 7 deletions slurm_train.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,16 @@
#SBATCH --job-name=NeurWP
#SBATCH --account=s83
#SBATCH --time=24:00:00
#SBATCH --nodes=5
#SBATCH --gres=gpu:8
#SBATCH --ntasks-per-node=8
#SBATCH --nodes=4
#SBATCH --ntasks-per-node=4
#SBATCH --partition=normal
#SBATCH --mem=375G
#SBATCH --mem=444G
#SBATCH --no-requeue
#SBATCH --exclusive
#SBATCH --output=lightning_logs/neurwp_out.log
#SBATCH --error=lightning_logs/neurwp_err.log

export PREPROCESS=true
export PREPROCESS=false
export NORMALIZE=false
export DATASET="cosmo"
export MODEL="hi_lam"
Expand Down Expand Up @@ -41,7 +40,8 @@ fi

echo "Training model"
if [ "$MODEL" = "hi_lam" ]; then
srun -ul python train_model.py --dataset $DATASET --val_interval 20 --epochs 40 --n_workers 4 --batch_size 1 --subset_ds 0 --model hi_lam --graph hierarchical
srun -ul python train_model.py --dataset $DATASET --val_interval 20 --epochs 20 --n_workers 8 \
--batch_size 12 --subset_ds 0 --model hi_lam --graph hierarchical
else
srun -ul python train_model.py --dataset $DATASET --val_interval 20 --epochs 40 --n_workers 4 --batch_size 1 --subset_ds 0
srun -ul python train_model.py --dataset $DATASET --val_interval 20 --epochs 40 --n_workers 8 --batch_size 12 --subset_ds 0
fi

0 comments on commit dbdd1fd

Please sign in to comment.