update slurm demo on magtrain
xiyang-aads-lilly committed Jun 17, 2024
1 parent be537fc commit cff21cd
Showing 5 changed files with 85 additions and 12 deletions.
41 changes: 41 additions & 0 deletions experiments/demo_magtrain_llm_sft.sh
@@ -0,0 +1,41 @@
#!/usr/bin/bash
whoami
pwd

HOME=/home/l069561  # pin HOME so the cache/config paths below resolve the same way inside the container

ROOT=${HOME}/project/alignment-handbook


SCRIPTPATH=${ROOT}/experiments
source ${SCRIPTPATH}/wandb.sh

echo $SLURM_TMPDIR
# /cache is the container-side bind mount of node-local $SLURM_TMPDIR (see the
# sbatch script); persistent HF caches live under the shared home instead
export TMPDIR="/cache"
export HF_DATASETS_CACHE="${HOME}/cache/dataset"
export HF_HOME="${HOME}/cache/hf"
export TRITON_CACHE_DIR="/cache"


# TORCH and NCCL
# CUDA_LAUNCH_BLOCKING=1 serializes kernel launches -- useful for debugging,
# but it costs throughput; drop it for full-speed runs
export CUDA_LAUNCH_BLOCKING=1
export TORCH_DISTRIBUTED_DEBUG=INFO
# export NCCL_DEBUG=INFO
# export NCCL_SOCKET_NTHREADS=16
export DEEPSPEED_TIMEOUT=120

# export WORLD_SIZE=$(($SLURM_JOB_NUM_NODES*$SLURM_NTASKS_PER_NODE))

echo $PRIMARY
echo $PRIMARY_PORT

# one launcher per node (ntasks-per-node=1); torchrun spawns one worker per
# GPU, and all ranks rendezvous at $PRIMARY:$PRIMARY_PORT
torchrun \
--nproc_per_node=$SLURM_GPUS_ON_NODE \
--nnodes=$SLURM_JOB_NUM_NODES \
--node_rank=$SLURM_NODEID \
--master_addr=$PRIMARY \
--master_port=$PRIMARY_PORT \
${ROOT}/scripts/run_sft.py \
${ROOT}/recipes/llama3-8b/sft/config_qlora.yaml \
--deepspeed=${ROOT}/recipes/accelerate_configs/deepspeed_zs2.json \
--tee=2
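This script (like the sbatch script below) sources wandb.sh, which is not part of this commit. Presumably it just exports the standard Weights & Biases environment variables; a hypothetical sketch, with the secrets left blank:

# hypothetical wandb.sh -- an assumption, the file is not shown in this commit
export WANDB_API_KEY=...                 # secret, intentionally left elided
export WANDB_PROJECT=alignment-handbook  # hypothetical project name
export WANDB_ENTITY=...                  # hypothetical team/user name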
32 changes: 29 additions & 3 deletions experiments/demo_magtrain_slurm.sh
@@ -1,7 +1,33 @@
#!/bin/bash

#SBATCH --job-name=llm_sft
#SBATCH --mail-type=ALL
#SBATCH --mail-user=[email protected]
#SBATCH --nodes=4
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=4
#SBATCH --gpus-per-task=4
#SBATCH --cpus-per-task=80
#SBATCH --mem=512gb
#SBATCH --time=48:00:00
#SBATCH --output=/home/l069561/project/log/alignment/sft_%j.out
#SBATCH --partition=batch
##SBATCH --exclusive
##SBATCH --reservation=gatortrongpt

SCRIPT=$(readlink -f "$0")
SCRIPTPATH=$(dirname "$SCRIPT")
# the auto-detected SCRIPTPATH above is then overridden with a fixed location
HOME=/home/l069561
SCRIPTPATH=${HOME}/project/alignment-handbook/experiments

echo $SCRIPTPATH
echo $SLURM_NTASKS_PER_NODE
echo $SLURM_JOB_NUM_NODES
echo $SLURM_GPUS_ON_NODE
source ${SCRIPTPATH}/util.sh
source ${SCRIPTPATH}/wandb.sh

CONTAINER=${HOME}/container/pt2402.sif

# run one task per node inside the Apptainer container, bind-mounting the
# node-local scratch dir at /cache
srun --jobid $SLURM_JOB_ID apptainer exec -B $SLURM_TMPDIR:/cache --nv $CONTAINER bash ${SCRIPTPATH}/demo_magtrain_llm_sft.sh

# NSYS=nsys profile -t cuda,nvtx -o /cache/nsys
# srun --jobid $SLURM_JOB_ID apptainer exec -B $SLURM_TMPDIR:/cache --nv $CONTAINER ${NSYS} bash ${SCRIPTPATH}/demo_magtrain_llm_sft.sh
# cp $SLURM_TMPDIR/nsys-rep /home/l069561/project/log/
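The launcher script echoes $PRIMARY and $PRIMARY_PORT, which are presumably exported by the sourced util.sh (also not in this commit) and inherited through srun's environment. A hypothetical sketch of the usual SLURM pattern; the job itself would then be submitted with plain "sbatch experiments/demo_magtrain_slurm.sh":

# hypothetical util.sh contents -- an assumption, the file is not shown here
PRIMARY=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)  # rendezvous on the first allocated node
PRIMARY_PORT=$((29500 + SLURM_JOB_ID % 1000))                         # per-job port to avoid collisions
export PRIMARY PRIMARY_PORT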
4 changes: 2 additions & 2 deletions recipes/llama3-8b/sft/config_qlora.yaml
@@ -27,7 +27,7 @@ dataset_mixer:
dataset_splits:
- train_sft
- test_sft
-preprocessing_num_workers: 32
+preprocessing_num_workers: 16
auto_insert_empty_system_msg: true

# SFT trainer config
@@ -48,7 +48,7 @@ lr_scheduler_type: cosine
max_seq_length: 4096
max_steps: -1
num_train_epochs: 1
-output_dir: /home/l069561/project/alignment_handbook/experiments/models/demo-llama-3-8b-lora-ultrachat
+output_dir: /home/l069561/project/alignment-handbook/experiments/models/demo-llama-3-8b-qlora-ultrachat
overwrite_output_dir: true
per_device_train_batch_size: 4
gradient_accumulation_steps: 4
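For orientation, combined with the 4-node x 4-GPU job above these settings imply the following effective global batch size (a back-of-envelope sketch, assuming plain data parallelism under DeepSpeed ZeRO-2):

# effective batch = per_device_train_batch_size x gradient_accumulation_steps x world size
echo $(( 4 * 4 * 16 ))   # = 256 sequences per optimizer step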
2 changes: 2 additions & 0 deletions requirement.sh
@@ -0,0 +1,2 @@
pip install transformers==4.39.2
pip install trl==0.8.2
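The commit does not show how requirement.sh is applied; presumably it is run inside the training container so these pins override the container's own transformers/trl, e.g. (an assumption):

apptainer exec --nv ${HOME}/container/pt2402.sif bash requirement.sh   # pip falls back to a user install if the image is read-only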
18 changes: 11 additions & 7 deletions requirements.txt
@@ -3,23 +3,27 @@ datasets>=2.14.6
deepspeed>=0.12.2
einops>=0.6.1
evaluate==0.4.0
-flash-attn>=2.1.0
huggingface-hub>=0.14.1,<1.0
-jinja2>=3.0.0
ninja>=1.11.1
packaging>=23.0
parameterized>=0.9.0
peft>=0.6.1
-protobuf<=3.20.2
+protobuf<=3.20.3
-pynvml>=11.4.0
safetensors>=0.3.3

sentencepiece
tensorboard
-tqdm>=4.64.1
transformers>=4.35.0
trl>=0.7.4
+jinja2>=3.0.0
+tqdm>=4.64.1
+flash-attn>=2.1.0
+pynvml>=11.4.0
+wandb


# optional
-galore-torch
+# galore-torch

# unsloth
# with NV pytorch container install -> pip install git+https://github.com/unslothai/unsloth.git --no-deps
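One practical note on the flash-attn pin (general pip behavior, not something this repo states): flash-attn compiles against the torch build already present, so it is usually installed with build isolation disabled:

pip install "flash-attn>=2.1.0" --no-build-isolation   # needs ninja and packaging, both pinned above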
