Forked from huggingface/alignment-handbook

Commit 4f70851 (parent: 4c579bc): add qwen and phi-3 demo config for sft

Showing 14 changed files with 284 additions and 33 deletions.
SLURM launch script:

```diff
@@ -3,7 +3,7 @@
 #SBATCH --job-name=llm_sft
 #SBATCH --mail-type=ALL
 #SBATCH [email protected]
-#SBATCH --nodes=4
+#SBATCH --nodes=1
 #SBATCH --ntasks-per-node=1
 #SBATCH --gpus-per-node=4
 #SBATCH --gpus-per-task=4
@@ -22,7 +22,14 @@ echo $SLURM_JOB_NUM_NODES
 echo $SLURM_GPUS_ON_NODE
 source ${SCRIPTPATH}/util.sh
 
-CONTAINER=${HOME}/container/pt2402.sif
+# CONTAINER=${HOME}/container/pt2402.sif
+CONTAINER=${HOME}/container/pt2402
+
+export TRITON_HOME=${HOME}/project/cache/triton
+export TRITON_CACHE_DIR=${HOME}/project/cache/triton/cache
+export TRITON_DUMP_DIR=${HOME}/project/cache/triton/dump
+export HF_DATASETS_CACHE=${HOME}/project/cache/dataset
+export HF_HOME=${HOME}/project/cache/huggingface
 
 # srun --jobid $SLURM_JOB_ID apptainer exec -B $SLURM_TMPDIR:/cache --nv $CONTAINER bash ${SCRIPTPATH}/demo_magtrain_llm_sft.sh
 
@@ -35,7 +42,7 @@ srun --jobid $SLURM_JOB_ID \
     --nic-metrics=true \
     --capture-range=cudaProfilerApi \
     --capture-range-end=stop \
-    -o /cache/nsys_${SLURM_JOB_ID} \
+    -o $SLURM_TMPDIR/nsys_${SLURM_JOB_ID} \
     bash ${SCRIPTPATH}/demo_magtrain_llm_sft.sh
 
 cp $SLURM_TMPDIR/nsys_${SLURM_JOB_ID}.nsys-rep ${HOME}/project/log/nsys/
```
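Because nsys is launched with `--capture-range=cudaProfilerApi`, nothing is recorded until the training process itself calls into the CUDA profiler API. A minimal sketch of that gating (the step numbers and loop are hypothetical; `demo_magtrain_llm_sft.sh` itself is not shown in this commit):

```python
import torch

# With nsys --capture-range=cudaProfilerApi --capture-range-end=stop, only the
# region between these two calls ends up in nsys_${SLURM_JOB_ID}.nsys-rep.
def training_loop(step_fn, num_steps, profile_start=10, profile_stop=20):
    for step in range(num_steps):
        if step == profile_start:
            torch.cuda.profiler.start()  # nsys begins capturing here
        step_fn(step)
        if step == profile_stop:
            torch.cuda.profiler.stop()   # --capture-range-end=stop ends the session
```

Writing the report to `$SLURM_TMPDIR` and copying it out at the end avoids profiling I/O on the shared filesystem during the run.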
Phi-3 SFT recipe (previously the Llama-3 recipe):

```diff
@@ -1,49 +1,62 @@
 # Model arguments
-model_name_or_path: /home/l069561/project/models/Meta-Llama-3-8B
+model_name_or_path: /home/l069561/project/models/Phi-3-small-8k-instruct
 model_revision: main
 torch_dtype: bfloat16
-use_flash_attention_2: true
+attn_implementation: flash_attention_2
 
 # Data training arguments
-chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
+# chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
 dataset_mixer:
   HuggingFaceH4/ultrachat_200k: 1.0
+  /home/l069561/project/alignment-handbook/experiments/extra_sample_training_data: 1.0 # test local dataset
 dataset_splits:
 - train_sft
 - test_sft
-preprocessing_num_workers: 32
+preprocessing_num_workers: 16
 auto_insert_empty_system_msg: true
 
 # SFT trainer config
 bf16: true
 do_eval: true
-evaluation_strategy: epoch
+# evaluation_strategy: epoch
+eval_strategy: epoch
+max_grad_norm: 1.0
-gradient_accumulation_steps: 1
+# gradient_accumulation_steps: 16
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
-  use_reentrant: False
+  use_reentrant: true # this is required for phi-3 (https://huggingface.co/microsoft/Phi-3-small-8k-instruct/discussions/14)
 log_level: info
 logging_steps: 5
 logging_strategy: steps
-learning_rate: 2.0e-05
-optim: galore_adamw # adamw_torch paged_adamw_32bit galore_adamw lion_32bit
+learning_rate: 1.0e-05
+optim: adamw_torch #galore_adamw # adamw_torch paged_adamw_32bit galore_adamw lion_32bit
 optim_target_modules: all-linear
 weight_decay: 0.01
 lr_scheduler_type: cosine
-max_seq_length: 4096
+max_seq_length: 8000
 packing: false
 dataset_num_proc: 16
 max_steps: -1
-num_train_epochs: 1
-output_dir: /home/l069561/project/alignment_handbook/experiments/models/llama-3-full-ultrachat
+num_train_epochs: 3
+output_dir: /home/l069561/project/alignment-handbook/experiments/models/phi-3-small-8k-full-ultrachat
 overwrite_output_dir: true
-per_device_eval_batch_size: 8
-per_device_train_batch_size: 16
+per_device_eval_batch_size: 2
+per_device_train_batch_size: 2 # this is per device; you need to manually calculate the global batch as per device * gas * gpu * node
+gradient_accumulation_steps: 4
 push_to_hub: false
 remove_unused_columns: true
 report_to:
 - tensorboard
+- wandb
-save_strategy: "steps"
+save_strategy: "epoch"
 save_steps: 100
 save_total_limit: 1
 seed: 42
 warmup_ratio: 0.1
+trust_remote_code: true # only useful for models like phi-3
+
+# torch_compile: true
+# # https://pytorch.org/docs/stable/generated/torch.compile.html ('cudagraphs', 'inductor', 'onnxrt', 'openxla', 'openxla_eval', 'tvm')
+# # https://huggingface.co/docs/transformers/perf_train_gpu_one#using-torchcompile
+# torch_compile_backend: "inductor"
+# torch_compile_mode: "default" # reduce-overhead max-autotune
```
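The comment on `per_device_train_batch_size` is worth making concrete. With the values in this recipe and the one-node, four-GPU SLURM allocation above, the effective global batch size works out as follows:

```python
# Effective global batch = per_device * grad_accum * gpus_per_node * nodes
per_device_train_batch_size = 2   # from the recipe
gradient_accumulation_steps = 4   # from the recipe
gpus_per_node = 4                 # SBATCH --gpus-per-node=4
nodes = 1                         # SBATCH --nodes=1 after this commit

global_batch_size = (per_device_train_batch_size
                     * gradient_accumulation_steps
                     * gpus_per_node
                     * nodes)
print(global_batch_size)  # 32
```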
Full-parameter DPO recipe (new file):

```diff
@@ -0,0 +1,39 @@
+# Model arguments
+model_name_or_path:
+torch_dtype: null
+
+# Data training arguments
+# For definitions, see: src/h4/training/config.py
+dataset_mixer:
+  HuggingFaceH4/ultrafeedback_binarized: 1.0
+dataset_splits:
+- train_prefs
+- test_prefs
+preprocessing_num_workers: 12
+
+# DPOTrainer arguments
+bf16: true
+beta: 0.01
+do_eval: true
+evaluation_strategy: steps
+eval_steps: 100
+gradient_accumulation_steps: 2
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: False
+learning_rate: 5.0e-7
+log_level: info
+logging_steps: 10
+lr_scheduler_type: cosine
+max_length: 1024
+max_prompt_length: 512
+num_train_epochs: 1
+optim: adamw_torch
+output_dir:
+per_device_train_batch_size: 8
+per_device_eval_batch_size: 8
+save_strategy: "steps"
+save_steps: 100
+save_total_limit: 1
+seed: 42
+warmup_ratio: 0.1
```
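In the alignment-handbook, a recipe like this is passed straight to the training script, which parses it into dataclasses. Roughly how the upstream `run_dpo.py` consumes it (a sketch; the exact exported names can differ between handbook versions, and the recipe path below is a placeholder):

```python
# Sketch: how a DPO recipe YAML is loaded by the handbook's run_dpo.py.
from alignment import DataArguments, DPOConfig, H4ArgumentParser, ModelArguments

parser = H4ArgumentParser((ModelArguments, DataArguments, DPOConfig))
# Reads the YAML path from argv, e.g.:
#   python scripts/run_dpo.py recipes/<model>/dpo/config_full.yaml
model_args, data_args, training_args = parser.parse()

print(training_args.beta)           # 0.01
print(training_args.learning_rate)  # 5e-07
```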
QLoRA DPO recipe (new file):

```diff
@@ -0,0 +1,55 @@
+# Model arguments
+model_name_or_path:
+torch_dtype: bfloat16
+use_flash_attention_2: true
+
+# LoRA arguments
+use_peft: true
+load_in_4bit: true
+lora_r: 128
+lora_alpha: 128
+lora_dropout: 0.05
+lora_target_modules: all
+# - q_proj
+# - k_proj
+# - v_proj
+# - o_proj
+# - gate_proj
+# - up_proj
+# - down_proj
+
+# Data training arguments
+
+dataset_mixer:
+  HuggingFaceH4/ultrafeedback_binarized: 1.0
+dataset_splits:
+- train_prefs
+- test_prefs
+preprocessing_num_workers: 12
+
+# DPOTrainer arguments
+bf16: true
+beta: 0.01
+do_eval: true
+evaluation_strategy: steps
+eval_steps: 100
+gradient_accumulation_steps: 4
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+learning_rate: 5.0e-6
+log_level: info
+logging_steps: 10
+lr_scheduler_type: cosine
+max_length: 1024
+max_prompt_length: 512
+num_train_epochs: 1
+optim: paged_adamw_32bit
+output_dir:
+per_device_train_batch_size: 4
+per_device_eval_batch_size: 8
+save_strategy: "steps"
+save_steps: 100
+save_total_limit: 1
+seed: 42
+warmup_ratio: 0.1
```
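The LoRA block above corresponds to a 4-bit quantized frozen base model plus a peft adapter. A minimal sketch of the equivalent setup (an assumed mapping, not the handbook's exact code; the model path is a hypothetical placeholder, and peft's spelling for targeting every linear layer is `"all-linear"` rather than the recipe's `all`):

```python
import torch
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# load_in_4bit: true -> quantize the frozen base weights with bitsandbytes
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,  # matches torch_dtype: bfloat16
)
model = AutoModelForCausalLM.from_pretrained(
    "path/to/base-model",  # hypothetical; model_name_or_path is blank in the recipe
    quantization_config=bnb_config,
)

# lora_r / lora_alpha / lora_dropout from the recipe; "all-linear" stands in
# for the commented-out q/k/v/o/gate/up/down projection list
peft_config = LoraConfig(
    r=128,
    lora_alpha=128,
    lora_dropout=0.05,
    target_modules="all-linear",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()  # only the adapter weights are trainable
```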
Qwen2 SFT recipe (new file):

```diff
@@ -0,0 +1,55 @@
+# Model arguments
+model_name_or_path: /home/l069561/project/models/Qwen2-1.5B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments
+# chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
+dataset_mixer:
+  HuggingFaceH4/ultrachat_200k: 1.0
+  /home/l069561/project/alignment-handbook/experiments/extra_sample_training_data: 1.0 # test local dataset
+  /home/l069561/project/data/sang_data_formatted: 1.0
+dataset_splits:
+- train_sft
+- test_sft
+preprocessing_num_workers: 16
+
+# SFT trainer config
+bf16: true
+do_eval: true
+# evaluation_strategy: epoch
+eval_strategy: epoch
+max_grad_norm: 1.0
+# gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: False
+log_level: info
+logging_steps: 5
+logging_strategy: steps
+learning_rate: 2.0e-05
+optim: adamw_torch #galore_adamw # adamw_torch paged_adamw_32bit galore_adamw lion_32bit
+optim_target_modules: all-linear
+weight_decay: 0.01
+lr_scheduler_type: cosine
+max_seq_length: 8192
+packing: false
+dataset_num_proc: 16
+max_steps: -1
+num_train_epochs: 3
+output_dir: /home/l069561/project/alignment-handbook/experiments/models/models-qwen2-1.5b-inst-full-ultrachat
+overwrite_output_dir: true
+per_device_eval_batch_size: 2
+per_device_train_batch_size: 2 # this is per device; you need to manually calculate the global batch as per device * gas * gpu * node
+gradient_accumulation_steps: 4
+push_to_hub: false
+remove_unused_columns: true
+report_to:
+- tensorboard
+- wandb
+save_strategy: "epoch"
+save_steps: 100
+save_total_limit: 3
+seed: 42
+warmup_ratio: 0.1
```
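Note that the custom `chat_template` stays commented out in this recipe, so training falls back to the template bundled with the Qwen2 tokenizer. A quick way to inspect what that template renders (assuming the local checkpoint mirrors the Qwen/Qwen2-1.5B-Instruct hub model):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-1.5B-Instruct")
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]
# Render with the tokenizer's built-in template instead of the commented-out one
print(tokenizer.apply_chat_template(messages, tokenize=False,
                                    add_generation_prompt=True))
```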