update slurm demo on magtrain
xiyang-aads-lilly committed Jun 17, 2024
1 parent be537fc commit cff21cd
Showing 5 changed files with 85 additions and 12 deletions.
41 changes: 41 additions & 0 deletions experiments/demo_magtrain_llm_sft.sh
@@ -0,0 +1,41 @@
#!/usr/bin/bash
whoami
pwd

HOME=/home/l069561  # pin HOME so the cache/config paths below resolve the same way inside the container

ROOT=${HOME}/project/alignment-handbook


SCRIPTPATH=${ROOT}/experiments
source ${SCRIPTPATH}/wandb.sh

echo $SLURM_TMPDIR
# /cache is the container-side bind mount of node-local $SLURM_TMPDIR (see the
# sbatch script); persistent HF caches live under the shared home instead
export TMPDIR="/cache"
export HF_DATASETS_CACHE="${HOME}/cache/dataset"
export HF_HOME="${HOME}/cache/hf"
export TRITON_CACHE_DIR="/cache"


# TORCH and NCCL
# CUDA_LAUNCH_BLOCKING=1 serializes kernel launches -- useful for debugging,
# but it costs throughput; drop it for full-speed runs
export CUDA_LAUNCH_BLOCKING=1
export TORCH_DISTRIBUTED_DEBUG=INFO
# export NCCL_DEBUG=INFO
# export NCCL_SOCKET_NTHREADS=16
export DEEPSPEED_TIMEOUT=120

# export WORLD_SIZE=$(($SLURM_JOB_NUM_NODES*$SLURM_NTASKS_PER_NODE))

echo $PRIMARY
echo $PRIMARY_PORT

# one launcher per node (ntasks-per-node=1); torchrun spawns one worker per
# GPU, and all ranks rendezvous at $PRIMARY:$PRIMARY_PORT
torchrun \
--nproc_per_node=$SLURM_GPUS_ON_NODE \
--nnodes=$SLURM_JOB_NUM_NODES \
--node_rank=$SLURM_NODEID \
--master_addr=$PRIMARY \
--master_port=$PRIMARY_PORT \
${ROOT}/scripts/run_sft.py \
${ROOT}/recipes/llama3-8b/sft/config_qlora.yaml \
--deepspeed=${ROOT}/recipes/accelerate_configs/deepspeed_zs2.json \
--tee=2
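This script (like the sbatch script below) sources wandb.sh, which is not part of this commit. Presumably it just exports the standard Weights & Biases environment variables; a hypothetical sketch, with the secrets left blank:

# hypothetical wandb.sh -- an assumption, the file is not shown in this commit
export WANDB_API_KEY=...                 # secret, intentionally left elided
export WANDB_PROJECT=alignment-handbook  # hypothetical project name
export WANDB_ENTITY=...                  # hypothetical team/user name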
32 changes: 29 additions & 3 deletions experiments/demo_magtrain_slurm.sh
@@ -1,7 +1,33 @@
#!/bin/bash

#SBATCH --job-name=llm_sft
#SBATCH --mail-type=ALL
#SBATCH --mail-user=[email protected]
#SBATCH --nodes=4
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=4
#SBATCH --gpus-per-task=4
#SBATCH --cpus-per-task=80
#SBATCH --mem=512gb
#SBATCH --time=48:00:00
#SBATCH --output=/home/l069561/project/log/alignment/sft_%j.out
#SBATCH --partition=batch
##SBATCH --exclusive
##SBATCH --reservation=gatortrongpt

SCRIPT=$(readlink -f "$0")
SCRIPTPATH=$(dirname "$SCRIPT")
# the auto-detected SCRIPTPATH above is then overridden with a fixed location
HOME=/home/l069561
SCRIPTPATH=${HOME}/project/alignment-handbook/experiments

echo $SCRIPTPATH
echo $SLURM_NTASKS_PER_NODE
echo $SLURM_JOB_NUM_NODES
echo $SLURM_GPUS_ON_NODE
source ${SCRIPTPATH}/util.sh
source ${SCRIPTPATH}/wandb.sh

CONTAINER=${HOME}/container/pt2402.sif

# run one task per node inside the Apptainer container, bind-mounting the
# node-local scratch dir at /cache
srun --jobid $SLURM_JOB_ID apptainer exec -B $SLURM_TMPDIR:/cache --nv $CONTAINER bash ${SCRIPTPATH}/demo_magtrain_llm_sft.sh

# NSYS=nsys profile -t cuda,nvtx -o /cache/nsys
# srun --jobid $SLURM_JOB_ID apptainer exec -B $SLURM_TMPDIR:/cache --nv $CONTAINER ${NSYS} bash ${SCRIPTPATH}/demo_magtrain_llm_sft.sh
# cp $SLURM_TMPDIR/nsys-rep /home/l069561/project/log/
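The launcher script echoes $PRIMARY and $PRIMARY_PORT, which are presumably exported by the sourced util.sh (also not in this commit) and inherited through srun's environment. A hypothetical sketch of the usual SLURM pattern; the job itself would then be submitted with plain "sbatch experiments/demo_magtrain_slurm.sh":

# hypothetical util.sh contents -- an assumption, the file is not shown here
PRIMARY=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)  # rendezvous on the first allocated node
PRIMARY_PORT=$((29500 + SLURM_JOB_ID % 1000))                         # per-job port to avoid collisions
export PRIMARY PRIMARY_PORT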
4 changes: 2 additions & 2 deletions recipes/llama3-8b/sft/config_qlora.yaml
@@ -27,7 +27,7 @@ dataset_mixer:
dataset_splits:
- train_sft
- test_sft
-preprocessing_num_workers: 32
+preprocessing_num_workers: 16
auto_insert_empty_system_msg: true

# SFT trainer config
@@ -48,7 +48,7 @@ lr_scheduler_type: cosine
max_seq_length: 4096
max_steps: -1
num_train_epochs: 1
-output_dir: /home/l069561/project/alignment_handbook/experiments/models/demo-llama-3-8b-lora-ultrachat
+output_dir: /home/l069561/project/alignment-handbook/experiments/models/demo-llama-3-8b-qlora-ultrachat
overwrite_output_dir: true
per_device_train_batch_size: 4
gradient_accumulation_steps: 4
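For orientation, combined with the 4-node x 4-GPU job above these settings imply the following effective global batch size (a back-of-envelope sketch, assuming plain data parallelism under DeepSpeed ZeRO-2):

# effective batch = per_device_train_batch_size x gradient_accumulation_steps x world size
echo $(( 4 * 4 * 16 ))   # = 256 sequences per optimizer step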
2 changes: 2 additions & 0 deletions requirement.sh
@@ -0,0 +1,2 @@
pip install transformers==4.39.2
pip install trl==0.8.2
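The commit does not show how requirement.sh is applied; presumably it is run inside the training container so these pins override the container's own transformers/trl, e.g. (an assumption):

apptainer exec --nv ${HOME}/container/pt2402.sif bash requirement.sh   # pip falls back to a user install if the image is read-only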
18 changes: 11 additions & 7 deletions requirements.txt
@@ -3,23 +3,27 @@ datasets>=2.14.6
deepspeed>=0.12.2
einops>=0.6.1
evaluate==0.4.0
-flash-attn>=2.1.0
huggingface-hub>=0.14.1,<1.0
-jinja2>=3.0.0
ninja>=1.11.1
packaging>=23.0
parameterized>=0.9.0
peft>=0.6.1
-protobuf<=3.20.2
+protobuf<=3.20.3
-pynvml>=11.4.0
safetensors>=0.3.3

sentencepiece
tensorboard
-tqdm>=4.64.1
transformers>=4.35.0
trl>=0.7.4
+jinja2>=3.0.0
+tqdm>=4.64.1
+flash-attn>=2.1.0
+pynvml>=11.4.0
+wandb


# optional
-galore-torch
+# galore-torch

# unsloth
# with NV pytorch container install -> pip install git+https://github.com/unslothai/unsloth.git --no-deps
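One practical note on the flash-attn pin (general pip behavior, not something this repo states): flash-attn compiles against the torch build already present, so it is usually installed with build isolation disabled:

pip install "flash-attn>=2.1.0" --no-build-isolation   # needs ninja and packaging, both pinned above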
