From d4135ec82b53ba569a7bae2c9cbfc6563f4d8a1f Mon Sep 17 00:00:00 2001
From: dilyabareeva
Date: Fri, 6 Dec 2024 14:32:51 +0100
Subject: [PATCH] chore: add slurm files back

---
 slurm/apptainer/quanda_build.def     | 20 +++++++++++
 slurm/apptainer/quanda_pre_build.def | 36 ++++++++++++++++++++
 slurm/compute_explanations.sbatch    | 38 +++++++++++++++++++++
 slurm/compute_explanations.sh        |  4 +++
 slurm/train.sh                       | 47 --------------------------
 slurm/train_grid.sh                  | 50 ----------------------------
 6 files changed, 98 insertions(+), 97 deletions(-)
 create mode 100644 slurm/apptainer/quanda_build.def
 create mode 100644 slurm/apptainer/quanda_pre_build.def
 create mode 100644 slurm/compute_explanations.sbatch
 create mode 100644 slurm/compute_explanations.sh
 delete mode 100644 slurm/train.sh
 delete mode 100644 slurm/train_grid.sh

diff --git a/slurm/apptainer/quanda_build.def b/slurm/apptainer/quanda_build.def
new file mode 100644
index 00000000..50add354
--- /dev/null
+++ b/slurm/apptainer/quanda_build.def
@@ -0,0 +1,20 @@
+Bootstrap: localimage
+From: ./quanda_pre_build.sif
+
+%files
+    # Copies over the source code
+    ./quanda /opt/quanda
+
+%post
+    export PYTHONPATH="/opt/quanda/"
+    export LD_LIBRARY_PATH=/opt/conda/lib
+
+    cd /opt/quanda
+    pip install ".[tutorials]"
+
+    # Install wget and unzip
+    apt-get update && apt-get install -y wget unzip
+
+%runscript
+    cd /opt/quanda
+    python -u scripts/compute_explanations.py "$@"
diff --git a/slurm/apptainer/quanda_pre_build.def b/slurm/apptainer/quanda_pre_build.def
new file mode 100644
index 00000000..f65de100
--- /dev/null
+++ b/slurm/apptainer/quanda_pre_build.def
@@ -0,0 +1,36 @@
+Bootstrap: docker
+From: nvidia/cuda:11.7.1-cudnn8-devel-ubuntu20.04
+
+%environment
+    export PATH="/opt/conda/bin:$PATH"
+
+%post
+    apt-get -y update
+    apt-get -y install git
+    apt-get -y install curl
+
+    curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py39_4.12.0-Linux-x86_64.sh
+
+    chmod +x ~/miniconda.sh
+    ~/miniconda.sh -b -u -p /opt/conda
+    rm ~/miniconda.sh
+    export PATH="/opt/conda/bin:$PATH"
+    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib
+
+    pip install \
+        "numpy>=1.19.5" \
+        torch==2.0.0+cu117 --extra-index-url https://download.pytorch.org/whl/cu117 \
+        torchvision==0.15.0+cu117 --extra-index-url https://download.pytorch.org/whl/cu117 \
+        captum@git+https://github.com/pytorch/captum \
+        "lightning>=1.4.0" \
+        "torchmetrics>=1.4.0" \
+        "tqdm>=4.0.0" \
+        "traker>=0.3.2" \
+        "annoy>=1.17.0" \
+        "datasets>=2.0.0" \
+        "torcheval>=0.0.6" \
+        "matplotlib>=3.4.0" \
+        "pillow>=8.3.0" \
+        "python-dotenv>=1.0.0" \
+        "nltk>=3.6.0" \
+        "wandb>=0.12.10"
\ No newline at end of file
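The two definition files above deliberately split the container build: quanda_pre_build.def bakes the heavyweight CUDA, conda, and pip dependency layer once, while quanda_build.def only copies the source tree on top of the local quanda_pre_build.sif image and installs the quanda package. A minimal sketch of the incremental rebuild this enables, assuming the paths used by compute_explanations.sh below:

    # After editing the source in ./quanda, rebuild only the thin top layer;
    # the cached quanda_pre_build.sif base image is reused unchanged.
    apptainer build --fakeroot --force ./quanda_build.sif ./quanda/slurm/apptainer/quanda_build.def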
diff --git a/slurm/compute_explanations.sbatch b/slurm/compute_explanations.sbatch
new file mode 100644
index 00000000..9556998c
--- /dev/null
+++ b/slurm/compute_explanations.sbatch
@@ -0,0 +1,38 @@
+#!/bin/bash
+#SBATCH --output=%j_%x.out
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=4
+#SBATCH --gres=gpu:1
+#SBATCH --mem=16G
+#SBATCH --array=0-0
+#SBATCH --job-name=quanda_explanations
+
+source "/etc/slurm/local_job_dir.sh"
+
+# The next line is used for job statistics only; omit it if you do not need them.
+echo "$PWD/${SLURM_JOB_ID}_stats.out" > $LOCAL_JOB_DIR/stats_file_loc_cfg
+
+echo "Extract tiny-imagenet-200.zip to ${LOCAL_JOB_DIR}"
+time unzip -qq $DATAPOOL3/datasets/tiny-imagenet-200.zip -d $LOCAL_JOB_DIR
+
+# Make a folder for job results locally on the node. It ensures that data is copied back even when the job fails.
+mkdir -p "${LOCAL_JOB_DIR}/job_results"
+mkdir -p "${LOCAL_JOB_DIR}/tmp"
+
+# List of methods
+methods=("similarity" "representer_points" "tracincpfast" "arnoldi" "trak" "random")
+
+# Select the method based on the SLURM_ARRAY_TASK_ID
+method=${methods[$SLURM_ARRAY_TASK_ID]}
+
+echo "Compute Explanations"
+
+apptainer run --nv --env "PYTHONPATH=." --bind $LOCAL_JOB_DIR:/mnt/dataset --bind ${LOCAL_JOB_DIR}/job_results:/mnt/output --bind ${LOCAL_JOB_DIR}/tmp:/mnt/tmp ./quanda_build.sif --method "$method" --tiny_in_path "/mnt/dataset/" --panda_sketch_path "/mnt/tmp/sketch/" --output_dir "/mnt/output" --checkpoints_dir "/mnt/tmp/" --metadata_dir "/mnt/tmp/" --download
+
+# Copy all results generated in $LOCAL_JOB_DIR back to the submit folder, in a tarball named after the job id.
+cd "$LOCAL_JOB_DIR"
+tar -cf quanda_xpl_${SLURM_JOB_ID}.tar job_results
+cp quanda_xpl_${SLURM_JOB_ID}.tar $SLURM_SUBMIT_DIR/quanda_output/
+rm -rf ${LOCAL_JOB_DIR}/job_results
+rm -rf ${LOCAL_JOB_DIR}/tmp
diff --git a/slurm/compute_explanations.sh b/slurm/compute_explanations.sh
new file mode 100644
index 00000000..b14a37c8
--- /dev/null
+++ b/slurm/compute_explanations.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+apptainer build --fakeroot --force ./quanda_pre_build.sif ./quanda/slurm/apptainer/quanda_pre_build.def
+apptainer build --fakeroot --force ./quanda_build.sif ./quanda/slurm/apptainer/quanda_build.def
+sbatch ./quanda/slurm/compute_explanations.sbatch
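The sbatch script above requests --array=0-0, so by default only array index 0 (the "similarity" method) is computed, even though methods lists six entries. Because command-line options to sbatch override #SBATCH directives in the script, the job can be fanned out over all six methods without editing the file; a sketch, assuming it is submitted from the same directory as compute_explanations.sh:

    # One array task per entry of methods=(...); inside the job script,
    # SLURM_ARRAY_TASK_ID picks the method for each task.
    sbatch --array=0-5 ./quanda/slurm/compute_explanations.sbatch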
-cd "$LOCAL_JOB_DIR" -tar -czf train_${SLURM_JOB_ID}.tgz outputs -cp train_${SLURM_JOB_ID}.tgz $SLURM_SUBMIT_DIR -rm -rf ${LOCAL_JOB_DIR}/train_${SLURM_JOB_ID}.tgz diff --git a/slurm/train_grid.sh b/slurm/train_grid.sh deleted file mode 100644 index 7bb8f761..00000000 --- a/slurm/train_grid.sh +++ /dev/null @@ -1,50 +0,0 @@ -for lr in 0.1 -do - for scheduler in constant step - do - for opt in adam - do - for weight_decay in 0.0 0.1 - do - for augmentation in flip flip_rotate - do - sbatch train.sh --dataset_name tiny_imagenet --dataset_type mislabeled --epochs 150 --validate_each 15 --save_each 2 --batch_size 64 --device cuda --optimizer $opt --lr $lr --scheduler $scheduler --weight_decay $weight_decay --pretrained --augmentation $augmentation - done - done - done - done -done - -for lr in 0.1 -do - for scheduler in constant step - do - for opt in adam - do - for weight_decay in 0.0 0.1 - do - for augmentation in flip flip_rotate - do - sbatch train.sh --dataset_name tiny_imagenet --dataset_type mixed --epochs 150 --validate_each 15 --save_each 2 --batch_size 64 --device cuda --optimizer $opt --lr $lr --scheduler $scheduler --weight_decay $weight_decay --pretrained --augmentation $augmentation - done - done - done - done -done - -for lr in 0.1 -do - for scheduler in constant step - do - for opt in adam - do - for weight_decay in 0.0 0.1 - do - for augmentation in flip flip_rotate - do - sbatch train.sh --dataset_name tiny_imagenet --dataset_type shortcut --epochs 150 --validate_each 15 --save_each 2 --batch_size 64 --device cuda --optimizer $opt --lr $lr --scheduler $scheduler --weight_decay $weight_decay --pretrained --augmentation $augmentation - done - done - done - done -done