add demo on dgx02 for multi GPU training
xiyang-aads-lilly committed May 15, 2024
1 parent 190935b commit a5f0909
Showing 15 changed files with 305 additions and 29 deletions.
6 changes: 5 additions & 1 deletion .gitignore
@@ -164,4 +164,8 @@ data/
wandb/

.DS_Store
-.vscode
+.vscode
+
+experiments/*
+!experiments/.gitkeep
+!experiments/demo*
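The negation patterns keep the experiments/ directory itself in the repository (via .gitkeep) and keep the demo scripts tracked, while everything else generated under it stays ignored. This can be verified with git check-ignore, which prints the rule matching a path (hypothetical paths, as a sketch):

# should report experiments/* as the matching rule
git check-ignore -v experiments/run_output.log
# should print nothing and exit 1: the demo scripts are not ignored
git check-ignore -v experiments/demo_dgx2.sh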
Empty file added experiments/.gitkeep
39 changes: 39 additions & 0 deletions experiments/demo_dgx2.sh
@@ -0,0 +1,39 @@
#!/usr/bin/bash

ROOT=$(realpath ~)

# activate the Python virtual environment
echo "activate virtual ENV"
PYTHON_ENV=${ROOT}/project/scripts/v2306.sh
source "$PYTHON_ENV"

# number of GPUs; the demo uses the 3 GPUs exposed via CUDA_VISIBLE_DEVICES in demo_dgx2_launch.sh
WORLD_SIZE=3

# HF cache
export TMPDIR="${ROOT}/project/.cache/"
export HF_DATASETS_CACHE="${ROOT}/project/.cache/dataset"
export HF_HOME="${ROOT}/project/.cache/"

# Wandb
export WANDB_API_KEY="<key>"
export WANDB_USERNAME="xi-yang5"
export WANDB_PROJECT="demo_dgx2"
export WANDB_LOG_MODEL="false"
export WANDB_WATCH="false"

# TORCH and NCCL
export TORCH_DISTRIBUTED_DEBUG=INFO
export NCCL_DEBUG=INFO
# export NCCL_SOCKET_NTHREADS=16

export ACCELERATE_LOG_LEVEL=debug
export ACCELERATE_DEBUG_MODE="1"
export DEEPSPEED_TIMEOUT=120

accelerate launch \
--config_file ${ROOT}/project/alignment_handbook/recipes/accelerate_configs/deepspeed_zero2.yaml \
--num_processes $WORLD_SIZE \
--tee 3 \
${ROOT}/project/alignment_handbook/scripts/run_sft.py \
${ROOT}/project/alignment_handbook/recipes/llama3-8b/sft/config_qlora.yaml
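Since WORLD_SIZE must agree with the number of GPUs the job can actually see, a quick pre-launch sanity check helps; a minimal sketch, assuming nvidia-smi and the Python environment above are available:

# every GPU physically on the node
nvidia-smi -L
# GPUs visible to the process (this respects CUDA_VISIBLE_DEVICES); expect 3
python -c "import torch; print(torch.cuda.device_count())"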
15 changes: 15 additions & 0 deletions experiments/demo_dgx2_launch.sh
@@ -0,0 +1,15 @@
#!/usr/bin/bash

ROOT=$(realpath ~)

# singularity container
CONTAINER=${ROOT}/project/singularity_containers/py2402.sig

# CUDA
export CUDA_VISIBLE_DEVICES=0,1,2

# PATH
DEMO_PATH=${ROOT}/project/alignment_handbook/experiments

# launch
singularity exec --nv $CONTAINER bash ${DEMO_PATH}/demo_dgx2.sh
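With the container image and paths in place, the whole demo is started by invoking the launcher directly on the node (a hypothetical session, assuming the paths above exist):

bash ~/project/alignment_handbook/experiments/demo_dgx2_launch.sh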
2 changes: 1 addition & 1 deletion job_scripts/run_sft.sh
@@ -34,7 +34,7 @@ export ACCELERATE_LOG_LEVEL=debug
export ACCELERATE_DEBUG_MODE="1"
export DEEPSPEED_TIMEOUT=120

-pip install -U git+https://github.com/huggingface/trl
+# pip install -U git+https://github.com/huggingface/trl

# export WANDB_PROJECT="alignment"
# export WANDB_WATCH="parameters"
2 changes: 1 addition & 1 deletion job_scripts/singularity_container.def
@@ -2,7 +2,7 @@ Bootstrap: docker
From: nvcr.io/nvidia/pytorch:{{ VERSION }}

%arguments
-VERSION=23.12-py3
+VERSION=24.04-py3

%files
requirements.txt requirements.txt
1 change: 1 addition & 0 deletions job_scripts/util.sh
@@ -1,3 +1,4 @@
+# for slurm use
get_unused_port() {
# Well-known ports end at 1023. On Linux, dynamic ports start at 32768
# (see /proc/sys/net/ipv4/ip_local_port_range).
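The rest of the function is collapsed in the diff view. For reference, a minimal sketch of one way such a helper can work (hypothetical; not the repository's actual implementation):

get_unused_port() {
    # scan the Linux dynamic range; a failed /dev/tcp connect means nothing
    # is listening on that loopback port, so it is free to use
    local port
    for port in $(seq 32768 60999); do
        if ! (exec 3<>"/dev/tcp/127.0.0.1/${port}") 2>/dev/null; then
            echo "${port}"
            return 0
        fi
    done
    return 1
}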
2 changes: 1 addition & 1 deletion recipes/accelerate_configs/deepspeed_zero2.yaml
@@ -4,7 +4,7 @@ deepspeed_config:
deepspeed_multinode_launcher: standard
offload_optimizer_device: none
offload_param_device: none
-zero3_init_flag: true
+zero3_init_flag: false
zero3_save_16bit_model: false
zero_stage: 2
mixed_precision: bf16
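Setting zero3_init_flag to false matches the stage in use: the flag enables deepspeed.zero.Init, the partitioned model construction that only applies to ZeRO stage 3, so it has no effect to offer in a zero_stage: 2 config.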
39 changes: 39 additions & 0 deletions recipes/llama3-8b/dpo/config_full.yaml
@@ -0,0 +1,39 @@
# Model arguments
model_name_or_path:
torch_dtype: null

# Data training arguments
# For definitions, see: src/h4/training/config.py
dataset_mixer:
HuggingFaceH4/ultrafeedback_binarized: 1.0
dataset_splits:
- train_prefs
- test_prefs
preprocessing_num_workers: 12

# DPOTrainer arguments
bf16: true
beta: 0.01
do_eval: true
evaluation_strategy: steps
eval_steps: 100
gradient_accumulation_steps: 2
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: False
learning_rate: 5.0e-7
log_level: info
logging_steps: 10
lr_scheduler_type: cosine
max_length: 1024
max_prompt_length: 512
num_train_epochs: 1
optim: adamw_torch
output_dir:
per_device_train_batch_size: 8
per_device_eval_batch_size: 8
save_strategy: "steps"
save_steps: 100
save_total_limit: 1
seed: 42
warmup_ratio: 0.1
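At these settings the effective batch size is per_device_train_batch_size × gradient_accumulation_steps × num_processes; with the demo's 3 GPUs that is 8 × 2 × 3 = 48 preference pairs per optimizer step (assuming the same WORLD_SIZE=3 as the demo launcher).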
55 changes: 55 additions & 0 deletions recipes/llama3-8b/dpo/config_qlora.yaml
@@ -0,0 +1,55 @@
# Model arguments
model_name_or_path:
torch_dtype: bfloat16
use_flash_attention_2: true

# LoRA arguments
use_peft: true
load_in_4bit: true
lora_r: 128
lora_alpha: 128
lora_dropout: 0.05
lora_target_modules: all
# - q_proj
# - k_proj
# - v_proj
# - o_proj
# - gate_proj
# - up_proj
# - down_proj

# Data training arguments

dataset_mixer:
HuggingFaceH4/ultrafeedback_binarized: 1.0
dataset_splits:
- train_prefs
- test_prefs
preprocessing_num_workers: 12

# DPOTrainer arguments
bf16: true
beta: 0.01
do_eval: true
evaluation_strategy: steps
eval_steps: 100
gradient_accumulation_steps: 4
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
learning_rate: 5.0e-6
log_level: info
logging_steps: 10
lr_scheduler_type: cosine
max_length: 1024
max_prompt_length: 512
num_train_epochs: 1
optim: paged_adamw_32bit
output_dir:
per_device_train_batch_size: 4
per_device_eval_batch_size: 8
save_strategy: "steps"
save_steps: 100
save_total_limit: 1
seed: 42
warmup_ratio: 0.1
46 changes: 46 additions & 0 deletions recipes/llama3-8b/sft/config_full.yaml
@@ -0,0 +1,46 @@
# Model arguments
model_name_or_path: /home/l069561/project/models/Meta-Llama-3-8B
model_revision: main
torch_dtype: bfloat16
use_flash_attention_2: true

# Data training arguments
chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
dataset_mixer:
HuggingFaceH4/ultrachat_200k: 1.0
dataset_splits:
- train_sft
- test_sft
preprocessing_num_workers: 8

# SFT trainer config
bf16: true
do_eval: true
evaluation_strategy: epoch
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: False
hub_model_id: null
hub_strategy: every_save
learning_rate: 2.0e-05
log_level: info
logging_steps: 5
logging_strategy: steps
lr_scheduler_type: cosine
max_seq_length: 2048
max_steps: -1
num_train_epochs: 1
output_dir: /home/l069561/project/models/fine-tuned/demo-llama-3-full-ultrachat
overwrite_output_dir: true
per_device_eval_batch_size: 8
per_device_train_batch_size: 16
push_to_hub: false
remove_unused_columns: true
report_to:
- tensorboard
save_strategy: "steps"
save_steps: 100
save_total_limit: 1
seed: 42
warmup_ratio: 0.1
60 changes: 60 additions & 0 deletions recipes/llama3-8b/sft/config_qlora.yaml
@@ -0,0 +1,60 @@
# Model arguments
model_name_or_path: /home/l069561/project/models/Meta-Llama-3-8B # no chat template
model_revision: main
torch_dtype: bfloat16
use_flash_attention_2: true

# LoRA arguments
load_in_4bit: true
use_peft: true
lora_r: 32
lora_alpha: 32
lora_dropout: 0.05
lora_target_modules: all
# - q_proj
# - k_proj
# - v_proj
# - o_proj
# - gate_proj
# - up_proj
# - down_proj

# Data training arguments
chat_template: "{% if messages[0]['role'] == 'system' %}{% set offset = 1 %}{% else %}{% set offset = 0 %}{% endif %}{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == offset) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{{ '<|im_start|>' + message['role'] + '\\n' + message['content'].strip() + '<|im_end|>\\n' }}{% if loop.last and message['role'] == 'user' and add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}{% endfor %}"
dataset_mixer:
HuggingFaceH4/ultrachat_200k: 1.0
dataset_splits:
- train_sft
- test_sft
preprocessing_num_workers: 16
auto_insert_empty_system_msg: true

# SFT trainer config
bf16: true
do_eval: true
evaluation_strategy: epoch
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
learning_rate: 1.0e-04
log_level: info
logging_steps: 5
logging_strategy: steps
lr_scheduler_type: cosine
max_seq_length: 4096
max_steps: -1
num_train_epochs: 1
output_dir: /home/l069561/project/models/fine-tuned/demo-llama-3-8b-lora-ultrachat
overwrite_output_dir: true
per_device_eval_batch_size: 8
per_device_train_batch_size: 4
push_to_hub: false
report_to:
- tensorboard
# - wandb
save_strategy: "steps"
save_steps: 100
save_total_limit: 1
seed: 42
warmup_ratio: 0.1
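Under the same 3-GPU demo assumption, the effective batch size here works out to 4 × 16 × 3 = 192 sequences per optimizer step; the higher gradient_accumulation_steps compensates for the smaller per-device batch that QLoRA's memory budget allows.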
19 changes: 19 additions & 0 deletions requirements.txt
@@ -0,0 +1,19 @@
accelerate>=0.23.0
datasets>=2.14.6
deepspeed>=0.12.2
einops>=0.6.1
evaluate==0.4.0
huggingface-hub>=0.14.1,<1.0
ninja>=1.11.1
packaging>=23.0
parameterized>=0.9.0
peft>=0.6.1
protobuf<=3.20.2
safetensors>=0.3.3
tensorboard
transformers>=4.35.0
trl>=0.7.4
jinja2>=3.0.0
tqdm>=4.64.1
flash-attn>=2.1.0
pynvml>=11.4.0
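One way to install these is in two steps, since flash-attn compiles against an already-installed torch; a hedged sketch, assuming the NGC PyTorch image already provides torch:

# install everything except flash-attn first
grep -v '^flash-attn' requirements.txt | pip install -r /dev/stdin
# flash-attn needs the installed torch visible at build time
pip install "flash-attn>=2.1.0" --no-build-isolation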