add qwen and phi-3 demo config for sft
xiyang-aads-lilly committed Aug 19, 2024
1 parent 4c579bc commit 4f70851
Showing 14 changed files with 284 additions and 33 deletions.
18 changes: 14 additions & 4 deletions experiments/demo_magtrain_llm_sft.sh
@@ -4,6 +4,8 @@ whoami
pwd
ds_report

echo $LD_LIBRARY_PATH

HOME=/home/l069561

ROOT=${HOME}/project/alignment-handbook
@@ -14,27 +16,35 @@ source ${SCRIPTPATH}/wandb.sh
echo $SLURM_TMPDIR
export TMPDIR="/cache"

export TRITON_CACHE_DIR=${HOME}/project/cache/triton
export TRITON_HOME=${HOME}/project/cache/triton
export TRITON_CACHE_DIR=${HOME}/project/cache/triton/cache
export TRITON_DUMP_DIR=${HOME}/project/cache/triton/dump
export HF_DATASETS_CACHE=${HOME}/project/cache/dataset
export HF_HOME=${HOME}/project/cache/huggingface

# TORCH and NCCL
export CUDA_LAUNCH_BLOCKING=1
export TORCH_DISTRIBUTED_DEBUG=INFO
# export NCCL_DEBUG=INFO
# export NCCL_SOCKET_NTHREADS=16
export NCCL_SOCKET_NTHREADS=16
export DEEPSPEED_TIMEOUT=120

echo $PRIMARY
echo $PRIMARY_PORT

# TRAIN_CONF=${ROOT}/recipes/llama3-8b/sft/config_full.yaml
TRAIN_CONF=${ROOT}/recipes/phi3/sft/config_full.yaml
# TRAIN_CONF=${ROOT}/recipes/qwen/sft/config_full.yaml

DEEPSPEED_CONF=${ROOT}/recipes/accelerate_configs/deepspeed_zs2.json

torchrun \
--nproc_per_node=$SLURM_GPUS_ON_NODE \
--nnode=$SLURM_JOB_NUM_NODES \
--node_rank=$SLURM_NODEID \
--master_addr=$PRIMARY \
--master_port=$PRIMARY_PORT \
${ROOT}/scripts/run_sft.py \
${ROOT}/recipes/llama3-8b/sft/config_full.yaml \
--deepspeed=${ROOT}/recipes/accelerate_configs/deepspeed_zs2.json \
$TRAIN_CONF \
--deepspeed=$DEEPSPEED_CONF \
--tee=2
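
For reference, a minimal sketch of what the deepspeed_zs2.json referenced above could look like: a ZeRO stage-2 config that defers batch sizing and clipping to the HF Trainer via "auto" values. The actual file is not shown in this diff, so treat it as an assumption:

{
  "bf16": { "enabled": "auto" },
  "zero_optimization": {
    "stage": 2,
    "overlap_comm": true,
    "contiguous_gradients": true
  },
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto"
}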
13 changes: 10 additions & 3 deletions experiments/demo_magtrain_slurm.sh
@@ -3,7 +3,7 @@
#SBATCH --job-name=llm_sft
#SBATCH --mail-type=ALL
#SBATCH [email protected]
#SBATCH --nodes=4
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=4
#SBATCH --gpus-per-task=4
@@ -22,7 +22,14 @@ echo $SLURM_JOB_NUM_NODES
echo $SLURM_GPUS_ON_NODE
source ${SCRIPTPATH}/util.sh

CONTAINER=${HOME}/container/pt2402.sif
# CONTAINER=${HOME}/container/pt2402.sif
CONTAINER=${HOME}/container/pt2402

export TRITON_HOME=${HOME}/project/cache/triton
export TRITON_CACHE_DIR=${HOME}/project/cache/triton/cache
export TRITON_DUMP_DIR=${HOME}/project/cache/triton/dump
export HF_DATASETS_CACHE=${HOME}/project/cache/dataset
export HF_HOME=${HOME}/project/cache/huggingface

# srun --jobid $SLURM_JOB_ID apptainer exec -B $SLURM_TMPDIR:/cache --nv $CONTAINER bash ${SCRIPTPATH}/demo_magtrain_llm_sft.sh

@@ -35,7 +42,7 @@ srun --jobid $SLURM_JOB_ID \
--nic-metrics=true \
--capture-range=cudaProfilerApi \
--capture-range-end=stop \
-o /cache/nsys_${SLURM_JOB_ID} \
-o $SLURM_TMPDIR/nsys_${SLURM_JOB_ID} \
bash ${SCRIPTPATH}/demo_magtrain_llm_sft.sh
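# note: with --capture-range=cudaProfilerApi above, nsys only records between
# cudaProfilerStart() and cudaProfilerStop(), so the training script is assumed
# to call torch.cuda.profiler.start()/stop() around the region of interest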

cp $SLURM_TMPDIR/nsys_${SLURM_JOB_ID}.nsys-rep ${HOME}/project/log/nsys/
7 changes: 4 additions & 3 deletions recipes/llama3-8b/sft/config_full.yaml
@@ -1,13 +1,14 @@
# Model arguments
model_name_or_path: /home/l069561/project/models/Meta-Llama-3-8B
model_name_or_path: /home/l069561/project/models/Meta-Llama-3.1-8B-Instruct
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
# chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
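# for illustration, the zephyr-style template above (commented out here) renders each turn as
#   <|user|>\n{content}{eos_token} / <|assistant|>\n{content}{eos_token}
# and appends a bare '<|assistant|>' when add_generation_prompt is true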
dataset_mixer:
HuggingFaceH4/ultrachat_200k: 1.0
/home/l069561/project/alignment-handbook/experiments/extra_sample_training_data: 1.0 # test local dataset
dataset_splits:
- train_sft
- test_sft
@@ -36,7 +37,7 @@ packing: false
dataset_num_proc: 16
max_steps: -1
num_train_epochs: 3
output_dir: /home/l069561/project/alignment-handbook/experiments/models/llama-3-full-ultrachat
output_dir: /home/l069561/project/alignment-handbook/experiments/models/llama-3.1-inst-full-ultrachat
overwrite_output_dir: true
per_device_eval_batch_size: 1
per_device_train_batch_size: 1 # per device; compute the global batch manually as per_device * gradient_accumulation_steps (gas) * gpus_per_node * num_nodes
43 changes: 28 additions & 15 deletions recipes/phi3/sft/config_full.yaml
@@ -1,49 +1,62 @@
# Model arguments
model_name_or_path: /home/l069561/project/models/Meta-Llama-3-8B
model_name_or_path: /home/l069561/project/models/Phi-3-small-8k-instruct
model_revision: main
torch_dtype: bfloat16
use_flash_attention_2: true
attn_implementation: flash_attention_2

# Data training arguments
chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
# chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
dataset_mixer:
HuggingFaceH4/ultrachat_200k: 1.0
/home/l069561/project/alignment-handbook/experiments/extra_sample_training_data: 1.0 # test local dataset
dataset_splits:
- train_sft
- test_sft
preprocessing_num_workers: 32
preprocessing_num_workers: 16
auto_insert_empty_system_msg: true

# SFT trainer config
bf16: true
do_eval: true
evaluation_strategy: epoch
# evaluation_strategy: epoch
eval_strategy: epoch
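# eval_strategy replaces the deprecated evaluation_strategy name in newer transformers releases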
max_grad_norm: 1.0
gradient_accumulation_steps: 1
# gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: False
use_reentrant: true # this is required for phi-3 (https://huggingface.co/microsoft/Phi-3-small-8k-instruct/discussions/14)
log_level: info
logging_steps: 5
logging_strategy: steps
learning_rate: 2.0e-05
optim: galore_adamw # adamw_torch paged_adamw_32bit galore_adamw lion_32bit
learning_rate: 1.0e-05
optim: adamw_torch # alternatives: galore_adamw, paged_adamw_32bit, lion_32bit
optim_target_modules: all-linear
weight_decay: 0.01
lr_scheduler_type: cosine
max_seq_length: 4096
max_seq_length: 8000
packing: false
dataset_num_proc: 16
max_steps: -1
num_train_epochs: 1
output_dir: /home/l069561/project/alignment_handbook/experiments/models/llama-3-full-ultrachat
num_train_epochs: 3
output_dir: /home/l069561/project/alignment-handbook/experiments/models/phi-3-small-8k-full-ultrachat
overwrite_output_dir: true
per_device_eval_batch_size: 8
per_device_train_batch_size: 16
per_device_eval_batch_size: 2
per_device_train_batch_size: 2 # per device; compute the global batch manually as per_device * gradient_accumulation_steps (gas) * gpus_per_node * num_nodes
gradient_accumulation_steps: 4
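# e.g. with the 1-node / 4-GPU setup in demo_magtrain_slurm.sh: 2 (per device) * 4 (gas) * 4 (gpus) * 1 (node) = global batch of 32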
push_to_hub: false
remove_unused_columns: true
report_to:
- tensorboard
- wandb
save_strategy: "steps"
save_strategy: "epoch"
save_steps: 100
save_total_limit: 1
seed: 42
warmup_ratio: 0.1
trust_remote_code: true # only needed for models like phi-3

# torch_compile: true
# # https://pytorch.org/docs/stable/generated/torch.compile.html ('cudagraphs', 'inductor', 'onnxrt', 'openxla', 'openxla_eval', 'tvm')
# # https://huggingface.co/docs/transformers/perf_train_gpu_one#using-torchcompile
# torch_compile_backend: "inductor"
# torch_compile_mode: "default" # reduce-overhead max-autotune
7 changes: 4 additions & 3 deletions recipes/phi3/sft/config_qlora.yaml
@@ -1,5 +1,5 @@
# Model arguments
model_name_or_path: /home/l069561/project/models/Meta-Llama-3-8B # no chat template
model_name_or_path: /home/l069561/project/models/Phi-3-small-8k-instruct # no chat template
model_revision: main
torch_dtype: bfloat16
use_flash_attention_2: true
@@ -21,9 +21,10 @@ lora_target_modules: all
# - down_proj

# Data training arguments
chat_template: "{% if messages[0]['role'] == 'system' %}{% set offset = 1 %}{% else %}{% set offset = 0 %}{% endif %}{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == offset) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{{ '<|im_start|>' + message['role'] + '\\n' + message['content'].strip() + '<|im_end|>\\n' }}{% if loop.last and message['role'] == 'user' and add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}{% endfor %}"
# chat_template: "{% if messages[0]['role'] == 'system' %}{% set offset = 1 %}{% else %}{% set offset = 0 %}{% endif %}{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == offset) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{{ '<|im_start|>' + message['role'] + '\\n' + message['content'].strip() + '<|im_end|>\\n' }}{% if loop.last and message['role'] == 'user' and add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}{% endfor %}"
dataset_mixer:
HuggingFaceH4/ultrachat_200k: 1.0
/home/l069561/project/alignment-handbook/experiments/extra_sample_training_data: 1.0 # test local dataset
dataset_splits:
- train_sft
- test_sft
@@ -48,7 +49,7 @@ lr_scheduler_type: cosine
max_seq_length: 4096
max_steps: -1
num_train_epochs: 1
output_dir: /home/l069561/project/alignment-handbook/experiments/models/demo-llama-3-8b-qlora-ultrachat
output_dir: /home/l069561/project/alignment-handbook/experiments/models/phi-3-small-8k-full-ultrachat-lora
overwrite_output_dir: true
per_device_train_batch_size: 4
gradient_accumulation_steps: 4
39 changes: 39 additions & 0 deletions recipes/qwen/dpo/config_full.yaml
@@ -0,0 +1,39 @@
# Model arguments
model_name_or_path:
torch_dtype: null
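# note: model_name_or_path and output_dir are left blank in this recipe; set both before launching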

# Data training arguments
# For definitions, see: src/h4/training/config.py
dataset_mixer:
HuggingFaceH4/ultrafeedback_binarized: 1.0
dataset_splits:
- train_prefs
- test_prefs
preprocessing_num_workers: 12

# DPOTrainer arguments
bf16: true
beta: 0.01
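# beta scales the implicit reward in the DPO objective:
#   loss = -log sigmoid( beta * ( log pi(y_w|x)/pi_ref(y_w|x) - log pi(y_l|x)/pi_ref(y_l|x) ) )
# where y_w / y_l are the chosen / rejected responses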
do_eval: true
evaluation_strategy: steps
eval_steps: 100
gradient_accumulation_steps: 2
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: False
learning_rate: 5.0e-7
log_level: info
logging_steps: 10
lr_scheduler_type: cosine
max_length: 1024
max_prompt_length: 512
num_train_epochs: 1
optim: adamw_torch
output_dir:
per_device_train_batch_size: 8
per_device_eval_batch_size: 8
save_strategy: "steps"
save_steps: 100
save_total_limit: 1
seed: 42
warmup_ratio: 0.1
55 changes: 55 additions & 0 deletions recipes/qwen/dpo/config_qlora.yaml
@@ -0,0 +1,55 @@
# Model arguments
model_name_or_path:
torch_dtype: bfloat16
use_flash_attention_2: true

# LoRA arguments
use_peft: true
load_in_4bit: true
lora_r: 128
lora_alpha: 128
lora_dropout: 0.05
lora_target_modules: all
# - q_proj
# - k_proj
# - v_proj
# - o_proj
# - gate_proj
# - up_proj
# - down_proj
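# note: the effective LoRA scaling factor is lora_alpha / lora_r = 128 / 128 = 1.0 here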

# Data training arguments

dataset_mixer:
HuggingFaceH4/ultrafeedback_binarized: 1.0
dataset_splits:
- train_prefs
- test_prefs
preprocessing_num_workers: 12

# DPOTrainer arguments
bf16: true
beta: 0.01
do_eval: true
evaluation_strategy: steps
eval_steps: 100
gradient_accumulation_steps: 4
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
learning_rate: 5.0e-6
log_level: info
logging_steps: 10
lr_scheduler_type: cosine
max_length: 1024
max_prompt_length: 512
num_train_epochs: 1
optim: paged_adamw_32bit
output_dir:
per_device_train_batch_size: 4
per_device_eval_batch_size: 8
save_strategy: "steps"
save_steps: 100
save_total_limit: 1
seed: 42
warmup_ratio: 0.1
55 changes: 55 additions & 0 deletions recipes/qwen/sft/config_full.yaml
@@ -0,0 +1,55 @@
# Model arguments
model_name_or_path: /home/l069561/project/models/Qwen2-1.5B-Instruct
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
# chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
dataset_mixer:
HuggingFaceH4/ultrachat_200k: 1.0
/home/l069561/project/alignment-handbook/experiments/extra_sample_training_data: 1.0 # test local dataset
/home/l069561/project/data/sang_data_formatted: 1.0
dataset_splits:
- train_sft
- test_sft
preprocessing_num_workers: 16

# SFT trainer config
bf16: true
do_eval: true
# evaluation_strategy: epoch
eval_strategy: epoch
max_grad_norm: 1.0
# gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: False
log_level: info
logging_steps: 5
logging_strategy: steps
learning_rate: 2.0e-05
optim: adamw_torch # alternatives: galore_adamw, paged_adamw_32bit, lion_32bit
optim_target_modules: all-linear
weight_decay: 0.01
lr_scheduler_type: cosine
max_seq_length: 8192
packing: false
dataset_num_proc: 16
max_steps: -1
num_train_epochs: 3
output_dir: /home/l069561/project/alignment-handbook/experiments/models/models-qwen2-1.5b-inst-full-ultrachat
overwrite_output_dir: true
per_device_eval_batch_size: 2
per_device_train_batch_size: 2 # per device; compute the global batch manually as per_device * gradient_accumulation_steps (gas) * gpus_per_node * num_nodes
gradient_accumulation_steps: 4
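# global batch here: 2 (per device) * 4 (gas) * 4 (gpus) * 1 (node) = 32, as in the phi-3 recipe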
push_to_hub: false
remove_unused_columns: true
report_to:
- tensorboard
- wandb
save_strategy: "epoch"
save_steps: 100
save_total_limit: 3
seed: 42
warmup_ratio: 0.1