add qwen and phi-3 demo config for sft
xiyang-aads-lilly committed Aug 19, 2024
1 parent 4c579bc commit 4f70851
Showing 14 changed files with 284 additions and 33 deletions.
18 changes: 14 additions & 4 deletions experiments/demo_magtrain_llm_sft.sh
@@ -4,6 +4,8 @@ whoami
pwd
ds_report

echo $LD_LIBRARY_PATH

HOME=/home/l069561

ROOT=${HOME}/project/alignment-handbook
@@ -14,27 +16,35 @@ source ${SCRIPTPATH}/wandb.sh
echo $SLURM_TMPDIR
export TMPDIR="/cache"

export TRITON_CACHE_DIR=${HOME}/project/cache/triton
export TRITON_HOME=${HOME}/project/cache/triton
export TRITON_CACHE_DIR=${HOME}/project/cache/triton/cache
export TRITON_DUMP_DIR=${HOME}/project/cache/triton/dump
export HF_DATASETS_CACHE=${HOME}/project/cache/dataset
export HF_HOME=${HOME}/project/cache/huggingface

# TORCH and NCCL
export CUDA_LAUNCH_BLOCKING=1
export TORCH_DISTRIBUTED_DEBUG=INFO
# export NCCL_DEBUG=INFO
# export NCCL_SOCKET_NTHREADS=16
export NCCL_SOCKET_NTHREADS=16
export DEEPSPEED_TIMEOUT=120

echo $PRIMARY
echo $PRIMARY_PORT

# TRAIN_CONF=${ROOT}/recipes/llama3-8b/sft/config_full.yaml
TRAIN_CONF=${ROOT}/recipes/phi3/sft/config_full.yaml
# TRAIN_CONF=${ROOT}/recipes/qwen/sft/config_full.yaml

DEEPSPEED_CONF=${ROOT}/recipes/accelerate_configs/deepspeed_zs2.json

torchrun \
--nproc_per_node=$SLURM_GPUS_ON_NODE \
--nnode=$SLURM_JOB_NUM_NODES \
--node_rank=$SLURM_NODEID \
--master_addr=$PRIMARY \
--master_port=$PRIMARY_PORT \
${ROOT}/scripts/run_sft.py \
${ROOT}/recipes/llama3-8b/sft/config_full.yaml \
--deepspeed=${ROOT}/recipes/accelerate_configs/deepspeed_zs2.json \
$TRAIN_CONF \
--deepspeed=$DEEPSPEED_CONF \
--tee=2
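
For reference, a minimal sketch of what the deepspeed_zs2.json referenced above could look like: a ZeRO stage-2 config that defers batch sizing and clipping to the HF Trainer via "auto" values. The actual file is not shown in this diff, so treat it as an assumption:

{
  "bf16": { "enabled": "auto" },
  "zero_optimization": {
    "stage": 2,
    "overlap_comm": true,
    "contiguous_gradients": true
  },
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto"
}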
13 changes: 10 additions & 3 deletions experiments/demo_magtrain_slurm.sh
@@ -3,7 +3,7 @@
#SBATCH --job-name=llm_sft
#SBATCH --mail-type=ALL
#SBATCH [email protected]
#SBATCH --nodes=4
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=4
#SBATCH --gpus-per-task=4
@@ -22,7 +22,14 @@ echo $SLURM_JOB_NUM_NODES
echo $SLURM_GPUS_ON_NODE
source ${SCRIPTPATH}/util.sh

CONTAINER=${HOME}/container/pt2402.sif
# CONTAINER=${HOME}/container/pt2402.sif
CONTAINER=${HOME}/container/pt2402

export TRITON_HOME=${HOME}/project/cache/triton
export TRITON_CACHE_DIR=${HOME}/project/cache/triton/cache
export TRITON_DUMP_DIR=${HOME}/project/cache/triton/dump
export HF_DATASETS_CACHE=${HOME}/project/cache/dataset
export HF_HOME=${HOME}/project/cache/huggingface

# srun --jobid $SLURM_JOB_ID apptainer exec -B $SLURM_TMPDIR:/cache --nv $CONTAINER bash ${SCRIPTPATH}/demo_magtrain_llm_sft.sh

@@ -35,7 +42,7 @@ srun --jobid $SLURM_JOB_ID \
--nic-metrics=true \
--capture-range=cudaProfilerApi \
--capture-range-end=stop \
-o /cache/nsys_${SLURM_JOB_ID} \
-o $SLURM_TMPDIR/nsys_${SLURM_JOB_ID} \
bash ${SCRIPTPATH}/demo_magtrain_llm_sft.sh
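# note: with --capture-range=cudaProfilerApi above, nsys only records between
# cudaProfilerStart() and cudaProfilerStop(), so the training script is assumed
# to call torch.cuda.profiler.start()/stop() around the region of interest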

cp $SLURM_TMPDIR/nsys_${SLURM_JOB_ID}.nsys-rep ${HOME}/project/log/nsys/
7 changes: 4 additions & 3 deletions recipes/llama3-8b/sft/config_full.yaml
@@ -1,13 +1,14 @@
# Model arguments
model_name_or_path: /home/l069561/project/models/Meta-Llama-3-8B
model_name_or_path: /home/l069561/project/models/Meta-Llama-3.1-8B-Instruct
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
# chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
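# for illustration, the zephyr-style template above (commented out here) renders each turn as
#   <|user|>\n{content}{eos_token} / <|assistant|>\n{content}{eos_token}
# and appends a bare '<|assistant|>' when add_generation_prompt is true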
dataset_mixer:
HuggingFaceH4/ultrachat_200k: 1.0
/home/l069561/project/alignment-handbook/experiments/extra_sample_training_data: 1.0 # test local dataset
dataset_splits:
- train_sft
- test_sft
@@ -36,7 +37,7 @@ packing: false
dataset_num_proc: 16
max_steps: -1
num_train_epochs: 3
output_dir: /home/l069561/project/alignment-handbook/experiments/models/llama-3-full-ultrachat
output_dir: /home/l069561/project/alignment-handbook/experiments/models/llama-3.1-inst-full-ultrachat
overwrite_output_dir: true
per_device_eval_batch_size: 1
per_device_train_batch_size: 1 # per device; compute the global batch manually as per_device * gradient_accumulation_steps (gas) * gpus_per_node * num_nodes
43 changes: 28 additions & 15 deletions recipes/phi3/sft/config_full.yaml
@@ -1,49 +1,62 @@
# Model arguments
model_name_or_path: /home/l069561/project/models/Meta-Llama-3-8B
model_name_or_path: /home/l069561/project/models/Phi-3-small-8k-instruct
model_revision: main
torch_dtype: bfloat16
use_flash_attention_2: true
attn_implementation: flash_attention_2

# Data training arguments
chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
# chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
dataset_mixer:
HuggingFaceH4/ultrachat_200k: 1.0
/home/l069561/project/alignment-handbook/experiments/extra_sample_training_data: 1.0 # test local dataset
dataset_splits:
- train_sft
- test_sft
preprocessing_num_workers: 32
preprocessing_num_workers: 16
auto_insert_empty_system_msg: true

# SFT trainer config
bf16: true
do_eval: true
evaluation_strategy: epoch
# evaluation_strategy: epoch
eval_strategy: epoch
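# eval_strategy replaces the deprecated evaluation_strategy name in newer transformers releases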
max_grad_norm: 1.0
gradient_accumulation_steps: 1
# gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: False
use_reentrant: true # this is required for phi-3 (https://huggingface.co/microsoft/Phi-3-small-8k-instruct/discussions/14)
log_level: info
logging_steps: 5
logging_strategy: steps
learning_rate: 2.0e-05
optim: galore_adamw # adamw_torch paged_adamw_32bit galore_adamw lion_32bit
learning_rate: 1.0e-05
optim: adamw_torch # alternatives: galore_adamw, paged_adamw_32bit, lion_32bit
optim_target_modules: all-linear
weight_decay: 0.01
lr_scheduler_type: cosine
max_seq_length: 4096
max_seq_length: 8000
packing: false
dataset_num_proc: 16
max_steps: -1
num_train_epochs: 1
output_dir: /home/l069561/project/alignment_handbook/experiments/models/llama-3-full-ultrachat
num_train_epochs: 3
output_dir: /home/l069561/project/alignment-handbook/experiments/models/phi-3-small-8k-full-ultrachat
overwrite_output_dir: true
per_device_eval_batch_size: 8
per_device_train_batch_size: 16
per_device_eval_batch_size: 2
per_device_train_batch_size: 2 # per device; compute the global batch manually as per_device * gradient_accumulation_steps (gas) * gpus_per_node * num_nodes
gradient_accumulation_steps: 4
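# e.g. with the 1-node / 4-GPU setup in demo_magtrain_slurm.sh: 2 (per device) * 4 (gas) * 4 (gpus) * 1 (node) = global batch of 32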
push_to_hub: false
remove_unused_columns: true
report_to:
- tensorboard
- wandb
save_strategy: "steps"
save_strategy: "epoch"
save_steps: 100
save_total_limit: 1
seed: 42
warmup_ratio: 0.1
trust_remote_code: true # only needed for models like phi-3

# torch_compile: true
# # https://pytorch.org/docs/stable/generated/torch.compile.html ('cudagraphs', 'inductor', 'onnxrt', 'openxla', 'openxla_eval', 'tvm')
# # https://huggingface.co/docs/transformers/perf_train_gpu_one#using-torchcompile
# torch_compile_backend: "inductor"
# torch_compile_mode: "default" # reduce-overhead max-autotune
7 changes: 4 additions & 3 deletions recipes/phi3/sft/config_qlora.yaml
@@ -1,5 +1,5 @@
# Model arguments
model_name_or_path: /home/l069561/project/models/Meta-Llama-3-8B # no chat template
model_name_or_path: /home/l069561/project/models/Phi-3-small-8k-instruct # no chat template
model_revision: main
torch_dtype: bfloat16
use_flash_attention_2: true
@@ -21,9 +21,10 @@ lora_target_modules: all
# - down_proj

# Data training arguments
chat_template: "{% if messages[0]['role'] == 'system' %}{% set offset = 1 %}{% else %}{% set offset = 0 %}{% endif %}{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == offset) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{{ '<|im_start|>' + message['role'] + '\\n' + message['content'].strip() + '<|im_end|>\\n' }}{% if loop.last and message['role'] == 'user' and add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}{% endfor %}"
# chat_template: "{% if messages[0]['role'] == 'system' %}{% set offset = 1 %}{% else %}{% set offset = 0 %}{% endif %}{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == offset) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{{ '<|im_start|>' + message['role'] + '\\n' + message['content'].strip() + '<|im_end|>\\n' }}{% if loop.last and message['role'] == 'user' and add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}{% endfor %}"
dataset_mixer:
HuggingFaceH4/ultrachat_200k: 1.0
/home/l069561/project/alignment-handbook/experiments/extra_sample_training_data: 1.0 # test local dataset
dataset_splits:
- train_sft
- test_sft
@@ -48,7 +49,7 @@ lr_scheduler_type: cosine
max_seq_length: 4096
max_steps: -1
num_train_epochs: 1
output_dir: /home/l069561/project/alignment-handbook/experiments/models/demo-llama-3-8b-qlora-ultrachat
output_dir: /home/l069561/project/alignment-handbook/experiments/models/phi-3-small-8k-full-ultrachat-lora
overwrite_output_dir: true
per_device_train_batch_size: 4
gradient_accumulation_steps: 4
39 changes: 39 additions & 0 deletions recipes/qwen/dpo/config_full.yaml
@@ -0,0 +1,39 @@
# Model arguments
model_name_or_path:
torch_dtype: null
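# note: model_name_or_path and output_dir are left blank in this recipe; set both before launching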

# Data training arguments
# For definitions, see: src/h4/training/config.py
dataset_mixer:
HuggingFaceH4/ultrafeedback_binarized: 1.0
dataset_splits:
- train_prefs
- test_prefs
preprocessing_num_workers: 12

# DPOTrainer arguments
bf16: true
beta: 0.01
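# beta scales the implicit reward in the DPO objective:
#   loss = -log sigmoid( beta * ( log pi(y_w|x)/pi_ref(y_w|x) - log pi(y_l|x)/pi_ref(y_l|x) ) )
# where y_w / y_l are the chosen / rejected responses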
do_eval: true
evaluation_strategy: steps
eval_steps: 100
gradient_accumulation_steps: 2
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: False
learning_rate: 5.0e-7
log_level: info
logging_steps: 10
lr_scheduler_type: cosine
max_length: 1024
max_prompt_length: 512
num_train_epochs: 1
optim: adamw_torch
output_dir:
per_device_train_batch_size: 8
per_device_eval_batch_size: 8
save_strategy: "steps"
save_steps: 100
save_total_limit: 1
seed: 42
warmup_ratio: 0.1
55 changes: 55 additions & 0 deletions recipes/qwen/dpo/config_qlora.yaml
@@ -0,0 +1,55 @@
# Model arguments
model_name_or_path:
torch_dtype: bfloat16
use_flash_attention_2: true

# LoRA arguments
use_peft: true
load_in_4bit: true
lora_r: 128
lora_alpha: 128
lora_dropout: 0.05
lora_target_modules: all
# - q_proj
# - k_proj
# - v_proj
# - o_proj
# - gate_proj
# - up_proj
# - down_proj
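# note: the effective LoRA scaling factor is lora_alpha / lora_r = 128 / 128 = 1.0 here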

# Data training arguments

dataset_mixer:
HuggingFaceH4/ultrafeedback_binarized: 1.0
dataset_splits:
- train_prefs
- test_prefs
preprocessing_num_workers: 12

# DPOTrainer arguments
bf16: true
beta: 0.01
do_eval: true
evaluation_strategy: steps
eval_steps: 100
gradient_accumulation_steps: 4
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
learning_rate: 5.0e-6
log_level: info
logging_steps: 10
lr_scheduler_type: cosine
max_length: 1024
max_prompt_length: 512
num_train_epochs: 1
optim: paged_adamw_32bit
output_dir:
per_device_train_batch_size: 4
per_device_eval_batch_size: 8
save_strategy: "steps"
save_steps: 100
save_total_limit: 1
seed: 42
warmup_ratio: 0.1
55 changes: 55 additions & 0 deletions recipes/qwen/sft/config_full.yaml
@@ -0,0 +1,55 @@
# Model arguments
model_name_or_path: /home/l069561/project/models/Qwen2-1.5B-Instruct
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
# chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
dataset_mixer:
HuggingFaceH4/ultrachat_200k: 1.0
/home/l069561/project/alignment-handbook/experiments/extra_sample_training_data: 1.0 # test local dataset
/home/l069561/project/data/sang_data_formatted: 1.0
dataset_splits:
- train_sft
- test_sft
preprocessing_num_workers: 16

# SFT trainer config
bf16: true
do_eval: true
# evaluation_strategy: epoch
eval_strategy: epoch
max_grad_norm: 1.0
# gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: False
log_level: info
logging_steps: 5
logging_strategy: steps
learning_rate: 2.0e-05
optim: adamw_torch # alternatives: galore_adamw, paged_adamw_32bit, lion_32bit
optim_target_modules: all-linear
weight_decay: 0.01
lr_scheduler_type: cosine
max_seq_length: 8192
packing: false
dataset_num_proc: 16
max_steps: -1
num_train_epochs: 3
output_dir: /home/l069561/project/alignment-handbook/experiments/models/models-qwen2-1.5b-inst-full-ultrachat
overwrite_output_dir: true
per_device_eval_batch_size: 2
per_device_train_batch_size: 2 # per device; compute the global batch manually as per_device * gradient_accumulation_steps (gas) * gpus_per_node * num_nodes
gradient_accumulation_steps: 4
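# global batch here: 2 (per device) * 4 (gas) * 4 (gpus) * 1 (node) = 32, as in the phi-3 recipe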
push_to_hub: false
remove_unused_columns: true
report_to:
- tensorboard
- wandb
save_strategy: "epoch"
save_steps: 100
save_total_limit: 3
seed: 42
warmup_ratio: 0.1