diff --git a/experiments/demo_magtrain_llm_sft.sh b/experiments/demo_magtrain_llm_sft.sh index 191daa47..85208ab6 100644 --- a/experiments/demo_magtrain_llm_sft.sh +++ b/experiments/demo_magtrain_llm_sft.sh @@ -1,5 +1,4 @@ #!/usr/bin/bash - whoami pwd ds_report @@ -33,8 +32,14 @@ echo $PRIMARY echo $PRIMARY_PORT # TRAIN_CONF=${ROOT}/recipes/llama3-8b/sft/config_full.yaml -TRAIN_CONF=${ROOT}/recipes/phi3/sft/config_full.yaml +# TRAIN_CONF=${ROOT}/recipes/phi3/sft/config_full.yaml # TRAIN_CONF=${ROOT}/recipes/qwen/sft/config_full.yaml +# TRAIN_CONF=${ROOT}/recipes/falcon_mamba/sft/config_full.yaml # needs further debugging, training gets stuck + +# manually set +export WANDB_PROJECT="sang" +# TRAIN_CONF=${ROOT}/recipes/sang_project/config_full_1.yaml +TRAIN_CONF=${ROOT}/recipes/sang_project/config_full_2.yaml DEEPSPEED_CONF=${ROOT}/recipes/accelerate_configs/deepspeed_zs2.json diff --git a/experiments/demo_magtrain_slurm.sh b/experiments/demo_magtrain_slurm.sh index 6185e8e8..3fa035f8 100644 --- a/experiments/demo_magtrain_slurm.sh +++ b/experiments/demo_magtrain_slurm.sh @@ -3,11 +3,11 @@ #SBATCH --job-name=llm_sft #SBATCH --mail-type=ALL #SBATCH --mail-user=xi.yang5@lilly.com -#SBATCH --nodes=1 +#SBATCH --nodes=4 #SBATCH --ntasks-per-node=1 #SBATCH --gpus-per-node=4 #SBATCH --gpus-per-task=4 -#SBATCH --cpus-per-task=64 +#SBATCH --cpus-per-task=32 #SBATCH --mem=512gb #SBATCH --time=48:00:00 #SBATCH --output=/home/l069561/project/log/alignment/sft_%j.out @@ -17,13 +17,13 @@ HOME=/home/l069561 SCRIPTPATH=${HOME}/project/alignment-handbook/experiments echo $SCRIPTPATH -echo $SLURM_NTASKS_PER_NODE echo $SLURM_JOB_NUM_NODES +echo $SLURM_NTASKS_PER_NODE echo $SLURM_GPUS_ON_NODE source ${SCRIPTPATH}/util.sh -# CONTAINER=${HOME}/container/pt2402.sif -CONTAINER=${HOME}/container/pt2402 +CONTAINER=${HOME}/container/pt2402.sif +# CONTAINER=${HOME}/container/pt2402 export TRITON_HOME=${HOME}/project/cache/triton export TRITON_CACHE_DIR=${HOME}/project/cache/triton/cache @@ -31,18 +31,18 
@@ export TRITON_DUMP_DIR=${HOME}/project/cache/triton/dump export HF_DATASETS_CACHE=${HOME}/project/cache/dataset export HF_HOME=${HOME}/project/cache/huggingface -# srun --jobid $SLURM_JOB_ID apptainer exec -B $SLURM_TMPDIR:/cache --nv $CONTAINER bash ${SCRIPTPATH}/demo_magtrain_llm_sft.sh +srun --jobid $SLURM_JOB_ID apptainer exec -B $SLURM_TMPDIR:/cache --nv $CONTAINER bash ${SCRIPTPATH}/demo_magtrain_llm_sft.sh # use nsys to profile training process -srun --jobid $SLURM_JOB_ID \ - apptainer exec -B $SLURM_TMPDIR:/cache --nv --fakeroot $CONTAINER \ - nsys profile -s none -t cuda,nvtx \ - --gpu-metrics-device=all \ - --gpu-metrics-frequency=100 \ - --nic-metrics=true \ - --capture-range=cudaProfilerApi \ - --capture-range-end=stop \ - -o $SLURM_TMPDIR/nsys_${SLURM_JOB_ID} \ - bash ${SCRIPTPATH}/demo_magtrain_llm_sft.sh - -cp $SLURM_TMPDIR/nsys_${SLURM_JOB_ID}.nsys-rep ${HOME}/project/log/nsys/ +# srun --jobid $SLURM_JOB_ID \ +# apptainer exec -B $SLURM_TMPDIR:/cache --nv --fakeroot $CONTAINER \ +# nsys profile -s none -t cuda,nvtx \ +# --gpu-metrics-device=all \ +# --gpu-metrics-frequency=100 \ +# --nic-metrics=true \ +# --capture-range=cudaProfilerApi \ +# --capture-range-end=stop \ +# -o $SLURM_TMPDIR/nsys_${SLURM_JOB_ID} \ +# bash ${SCRIPTPATH}/demo_magtrain_llm_sft.sh + +# cp $SLURM_TMPDIR/nsys_${SLURM_JOB_ID}.nsys-rep ${HOME}/project/log/nsys/ diff --git a/recipes/falcon_mamba/sft/config_full.yaml b/recipes/falcon_mamba/sft/config_full.yaml new file mode 100644 index 00000000..77cf8ccc --- /dev/null +++ b/recipes/falcon_mamba/sft/config_full.yaml @@ -0,0 +1,56 @@ +# Model arguments +model_name_or_path: /home/l069561/project/models/falcon-mamba-7b-instruct +model_revision: main +torch_dtype: bfloat16 +# attn_implementation: flash_attention_2 + +# Data training arguments +# chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ 
'<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" +dataset_mixer: + HuggingFaceH4/ultrachat_200k: 1.0 + /home/l069561/project/alignment-handbook/experiments/extra_sample_training_data: 1.0 # test local dataset + /home/l069561/project/data/sang_data_formatted: 1.0 +dataset_splits: +- train_sft +- test_sft +preprocessing_num_workers: 16 + +# SFT trainer config +bf16: true +do_eval: true +# evaluation_strategy: epoch +eval_strategy: steps +eval_steps: 1000 +max_grad_norm: 1.0 +# gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: False +log_level: info +logging_steps: 5 +logging_strategy: steps +learning_rate: 1.0e-05 +optim: adamw_torch #galore_adamw # adamw_torch paged_adamw_32bit galore_adamw lion_32bit +optim_target_modules: all-linear +weight_decay: 0.01 +lr_scheduler_type: cosine +max_seq_length: 8192 +packing: false +dataset_num_proc: 16 +max_steps: -1 +num_train_epochs: 3 +output_dir: /home/l069561/project/alignment-handbook/experiments/models/models-falcon-mamba-7b-inst-full-ultrachat +overwrite_output_dir: true +per_device_eval_batch_size: 1 +per_device_train_batch_size: 1 # this is per device, you need to manual calculate global batch by per device * gas * gpu * node +gradient_accumulation_steps: 4 +push_to_hub: false +remove_unused_columns: true +report_to: +- tensorboard +- wandb +save_strategy: "epoch" +save_steps: 100 +save_total_limit: 3 +seed: 42 +warmup_ratio: 0.1 diff --git a/recipes/falcon_mamba/sft/config_qlora.yaml b/recipes/falcon_mamba/sft/config_qlora.yaml new file mode 100644 index 00000000..5b9794aa --- /dev/null +++ b/recipes/falcon_mamba/sft/config_qlora.yaml @@ -0,0 +1,70 @@ +# Model arguments +model_name_or_path: /home/l069561/project/models/falcon-mamba-7b-instruct # no 
chat template +model_revision: main +torch_dtype: bfloat16 +use_flash_attention_2: true + +# LoRA arguments +use_unsloth: false # unsloth not support deepspeed yet +use_peft: true +load_in_4bit: true +lora_r: 32 +lora_alpha: 32 +lora_dropout: 0.05 +lora_target_modules: all +# - q_proj +# - k_proj +# - v_proj +# - o_proj +# - gate_proj +# - up_proj +# - down_proj + +# Data training arguments +# chat_template: "{% if messages[0]['role'] == 'system' %}{% set offset = 1 %}{% else %}{% set offset = 0 %}{% endif %}{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == offset) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{{ '<|im_start|>' + message['role'] + '\\n' + message['content'].strip() + '<|im_end|>\\n' }}{% if loop.last and message['role'] == 'user' and add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}{% endfor %}" +dataset_mixer: + HuggingFaceH4/ultrachat_200k: 1.0 +dataset_splits: +- train_sft +- test_sft +preprocessing_num_workers: 16 +auto_insert_empty_system_msg: true + +# SFT trainer config +bf16: true +do_eval: true +evaluation_strategy: epoch +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +learning_rate: 1.0e-04 +log_level: info +logging_steps: 5 +logging_strategy: steps +optim: adamw_torch # adamw_torch paged_adamw_32bit galore_adamw lion_32bit adamw_apex_fused +# optim_target_modules: all-linear +weight_decay: 0.01 +lr_scheduler_type: cosine +max_seq_length: 4096 +max_steps: -1 +num_train_epochs: 1 +output_dir: /home/l069561/project/alignment-handbook/experiments/models/demo-falcon-mamba-7b-inst-qlora-ultrachat +overwrite_output_dir: true +per_device_train_batch_size: 4 +gradient_accumulation_steps: 4 +per_device_eval_batch_size: 4 +push_to_hub: false +report_to: +- tensorboard +- wandb +save_strategy: "steps" +save_steps: 100 +save_total_limit: 1 +seed: 42 +warmup_ratio: 0.1 + +torch_compile: 
false +# https://pytorch.org/docs/stable/generated/torch.compile.html ('cudagraphs', 'inductor', 'onnxrt', 'openxla', 'openxla_eval', 'tvm']) +# https://huggingface.co/docs/transformers/perf_train_gpu_one#using-torchcompile +torch_compile_backend: "inductor" +torch_compile_mode: "default" # reduce-overhead max-autotune diff --git a/recipes/gemma/sft/config_full.yaml b/recipes/gemma/sft/config_full.yaml new file mode 100644 index 00000000..77cf8ccc --- /dev/null +++ b/recipes/gemma/sft/config_full.yaml @@ -0,0 +1,56 @@ +# Model arguments +model_name_or_path: /home/l069561/project/models/falcon-mamba-7b-instruct +model_revision: main +torch_dtype: bfloat16 +# attn_implementation: flash_attention_2 + +# Data training arguments +# chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" +dataset_mixer: + HuggingFaceH4/ultrachat_200k: 1.0 + /home/l069561/project/alignment-handbook/experiments/extra_sample_training_data: 1.0 # test local dataset + /home/l069561/project/data/sang_data_formatted: 1.0 +dataset_splits: +- train_sft +- test_sft +preprocessing_num_workers: 16 + +# SFT trainer config +bf16: true +do_eval: true +# evaluation_strategy: epoch +eval_strategy: steps +eval_steps: 1000 +max_grad_norm: 1.0 +# gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: False +log_level: info +logging_steps: 5 +logging_strategy: steps +learning_rate: 1.0e-05 +optim: adamw_torch #galore_adamw # adamw_torch paged_adamw_32bit galore_adamw lion_32bit +optim_target_modules: all-linear +weight_decay: 0.01 +lr_scheduler_type: cosine +max_seq_length: 
8192 +packing: false +dataset_num_proc: 16 +max_steps: -1 +num_train_epochs: 3 +output_dir: /home/l069561/project/alignment-handbook/experiments/models/models-falcon-mamba-7b-inst-full-ultrachat +overwrite_output_dir: true +per_device_eval_batch_size: 1 +per_device_train_batch_size: 1 # this is per device, you need to manual calculate global batch by per device * gas * gpu * node +gradient_accumulation_steps: 4 +push_to_hub: false +remove_unused_columns: true +report_to: +- tensorboard +- wandb +save_strategy: "epoch" +save_steps: 100 +save_total_limit: 3 +seed: 42 +warmup_ratio: 0.1 diff --git a/recipes/gemma/sft/config_qlora.yaml b/recipes/gemma/sft/config_qlora.yaml new file mode 100644 index 00000000..5b9794aa --- /dev/null +++ b/recipes/gemma/sft/config_qlora.yaml @@ -0,0 +1,70 @@ +# Model arguments +model_name_or_path: /home/l069561/project/models/falcon-mamba-7b-instruct # no chat template +model_revision: main +torch_dtype: bfloat16 +use_flash_attention_2: true + +# LoRA arguments +use_unsloth: false # unsloth not support deepspeed yet +use_peft: true +load_in_4bit: true +lora_r: 32 +lora_alpha: 32 +lora_dropout: 0.05 +lora_target_modules: all +# - q_proj +# - k_proj +# - v_proj +# - o_proj +# - gate_proj +# - up_proj +# - down_proj + +# Data training arguments +# chat_template: "{% if messages[0]['role'] == 'system' %}{% set offset = 1 %}{% else %}{% set offset = 0 %}{% endif %}{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == offset) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{{ '<|im_start|>' + message['role'] + '\\n' + message['content'].strip() + '<|im_end|>\\n' }}{% if loop.last and message['role'] == 'user' and add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}{% endfor %}" +dataset_mixer: + HuggingFaceH4/ultrachat_200k: 1.0 +dataset_splits: +- train_sft +- test_sft +preprocessing_num_workers: 16 
+auto_insert_empty_system_msg: true + +# SFT trainer config +bf16: true +do_eval: true +evaluation_strategy: epoch +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +learning_rate: 1.0e-04 +log_level: info +logging_steps: 5 +logging_strategy: steps +optim: adamw_torch # adamw_torch paged_adamw_32bit galore_adamw lion_32bit adamw_apex_fused +# optim_target_modules: all-linear +weight_decay: 0.01 +lr_scheduler_type: cosine +max_seq_length: 4096 +max_steps: -1 +num_train_epochs: 1 +output_dir: /home/l069561/project/alignment-handbook/experiments/models/demo-falcon-mamba-7b-inst-qlora-ultrachat +overwrite_output_dir: true +per_device_train_batch_size: 4 +gradient_accumulation_steps: 4 +per_device_eval_batch_size: 4 +push_to_hub: false +report_to: +- tensorboard +- wandb +save_strategy: "steps" +save_steps: 100 +save_total_limit: 1 +seed: 42 +warmup_ratio: 0.1 + +torch_compile: false +# https://pytorch.org/docs/stable/generated/torch.compile.html ('cudagraphs', 'inductor', 'onnxrt', 'openxla', 'openxla_eval', 'tvm']) +# https://huggingface.co/docs/transformers/perf_train_gpu_one#using-torchcompile +torch_compile_backend: "inductor" +torch_compile_mode: "default" # reduce-overhead max-autotune diff --git a/recipes/llama3-8b/sft/config_full.yaml b/recipes/llama3-8b/sft/config_full.yaml index 69610497..47aea00a 100644 --- a/recipes/llama3-8b/sft/config_full.yaml +++ b/recipes/llama3-8b/sft/config_full.yaml @@ -9,6 +9,7 @@ attn_implementation: flash_attention_2 dataset_mixer: HuggingFaceH4/ultrachat_200k: 1.0 /home/l069561/project/alignment-handbook/experiments/extra_sample_training_data: 1.0 # test local dataset + # /home/l069561/project/data/sang_data_formatted: 1.0 dataset_splits: - train_sft - test_sft @@ -27,7 +28,7 @@ gradient_checkpointing_kwargs: log_level: info logging_steps: 5 logging_strategy: steps -learning_rate: 2.0e-05 +learning_rate: 1.0e-05 optim: galore_adamw # adamw_torch paged_adamw_32bit galore_adamw lion_32bit 
optim_target_modules: all-linear weight_decay: 0.01 @@ -36,7 +37,7 @@ max_seq_length: 8192 packing: false dataset_num_proc: 16 max_steps: -1 -num_train_epochs: 3 +num_train_epochs: 1 output_dir: /home/l069561/project/alignment-handbook/experiments/models/llama-3.1-inst-full-ultrachat overwrite_output_dir: true per_device_eval_batch_size: 1 diff --git a/recipes/sang_project/config_full_1.yaml b/recipes/sang_project/config_full_1.yaml new file mode 100644 index 00000000..717b5e84 --- /dev/null +++ b/recipes/sang_project/config_full_1.yaml @@ -0,0 +1,54 @@ +# Model arguments +model_name_or_path: /home/l069561/project/models/gemma-2-2b +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +chat_template: "{% if messages[0]['role'] == 'system' %}{% set system_message = '### System Instruction: ' + messages[0]['content'] | trim + '' %}{% set messages = messages[1:] %}{% else %}{% set system_message = '' %}{% endif %}{{ bos_token + system_message }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '### Context: ' + message['content'] | trim + '' }}{% elif message['role'] == 'assistant' %}{{ '### Result: ' + message['content'] | trim + eos_token + '' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '### Result: ' }}{% endif %}" +dataset_mixer: + /home/l069561/project/data/processed_data_open_sourced_xml_to_text/merged_open_sourced_xml_to_text_dataset: 1.0 + # /home/l069561/project/data/sang_data_formatted: 1.0 +dataset_splits: +- train_sft +- test_sft +preprocessing_num_workers: 4 + +# SFT trainer config +bf16: true +do_eval: true +# evaluation_strategy: epoch +eval_strategy: epoch +max_grad_norm: 1.0 +# gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: False 
+log_level: info +logging_steps: 5 +logging_strategy: steps +learning_rate: 2.0e-05 +optim: adamw_torch # adamw_torch paged_adamw_32bit galore_adamw lion_32bit +optim_target_modules: all-linear +weight_decay: 0.01 +lr_scheduler_type: cosine +max_seq_length: 8192 +packing: false +dataset_num_proc: 16 +max_steps: -1 +num_train_epochs: 2 +output_dir: /home/l069561/project/alignment-handbook/experiments/models/sang_exp1_stage1_gemma-2-2b_full +overwrite_output_dir: true +per_device_eval_batch_size: 1 +per_device_train_batch_size: 1 # this is per device, you need to manual calculate global batch by per device * gas * gpu * node +gradient_accumulation_steps: 4 +push_to_hub: false +remove_unused_columns: true +report_to: +- tensorboard +- wandb +save_strategy: "steps" +save_steps: 2000 +save_total_limit: 10 +seed: 42 +warmup_ratio: 0.1 diff --git a/recipes/sang_project/config_full_2.yaml b/recipes/sang_project/config_full_2.yaml new file mode 100644 index 00000000..67b604bd --- /dev/null +++ b/recipes/sang_project/config_full_2.yaml @@ -0,0 +1,55 @@ +# Model arguments +model_name_or_path: /home/l069561/project/models/Qwen2-1.5B +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +# qwen does not use bos token - https://github.com/QwenLM/Qwen2/issues/486 +chat_template: "{% if messages[0]['role'] == 'system' %}{% set system_message = '### Instruction: ' + messages[0]['content'] | trim + '\n\n' %}{% set messages = messages[1:] %}{% else %}{% set system_message = '' %}{% endif %}{{ system_message }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '### XML Data:\n' + message['content'] | trim + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ '### Summary: ' + message['content'] | trim + eos_token + '' }}{% endif %}{% endfor %}{% if 
add_generation_prompt %}{{ '### Summary: ' }}{% endif %}" +dataset_mixer: + /home/l069561/project/data/processed_data_open_sourced_xml_to_text/merged_open_sourced_xml_to_text_dataset: 1.0 + # /home/l069561/project/data/sang_data_formatted: 1.0 +dataset_splits: +- train_sft +- test_sft +preprocessing_num_workers: 8 +dataset_num_proc: 8 +packing: false + +# SFT trainer config +bf16: true +do_eval: true +# evaluation_strategy: epoch +eval_strategy: epoch +max_grad_norm: 1.0 +# gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: False +log_level: info +logging_steps: 5 +logging_strategy: steps +learning_rate: 1.0e-05 +optim: adamw_torch # adamw_torch paged_adamw_32bit galore_adamw lion_32bit +optim_target_modules: all-linear +weight_decay: 0.01 +lr_scheduler_type: linear +max_seq_length: 8192 +max_steps: -1 +num_train_epochs: 2 +output_dir: /home/l069561/project/alignment-handbook/experiments/models/sang_exp1_stage1_qwen-2b_full +overwrite_output_dir: true +per_device_eval_batch_size: 1 +per_device_train_batch_size: 1 # this is per device, you need to manual calculate global batch by per device * gas * gpu * node +gradient_accumulation_steps: 4 +push_to_hub: false +remove_unused_columns: true +report_to: +- tensorboard +- wandb +save_strategy: "steps" +save_steps: 1500 +save_total_limit: 10 +seed: 42 +warmup_ratio: 0.1 diff --git a/recipes/sang_project/config_qlora.yaml b/recipes/sang_project/config_qlora.yaml new file mode 100644 index 00000000..4f343a39 --- /dev/null +++ b/recipes/sang_project/config_qlora.yaml @@ -0,0 +1,70 @@ +# Model arguments +model_name_or_path: /home/l069561/project/models/Meta-Llama-3-8B # no chat template +model_revision: main +torch_dtype: bfloat16 +use_flash_attention_2: true + +# LoRA arguments +use_unsloth: false # unsloth not support deepspeed yet +use_peft: true +load_in_4bit: true +lora_r: 32 +lora_alpha: 32 +lora_dropout: 0.05 +lora_target_modules: all +# - q_proj +# - k_proj 
+# - v_proj +# - o_proj +# - gate_proj +# - up_proj +# - down_proj + +# Data training arguments +chat_template: "{% if messages[0]['role'] == 'system' %}{% set offset = 1 %}{% else %}{% set offset = 0 %}{% endif %}{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == offset) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{{ '<|im_start|>' + message['role'] + '\\n' + message['content'].strip() + '<|im_end|>\\n' }}{% if loop.last and message['role'] == 'user' and add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}{% endfor %}" +dataset_mixer: + HuggingFaceH4/ultrachat_200k: 1.0 +dataset_splits: +- train_sft +- test_sft +preprocessing_num_workers: 16 +auto_insert_empty_system_msg: true + +# SFT trainer config +bf16: true +do_eval: true +evaluation_strategy: epoch +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +learning_rate: 1.0e-04 +log_level: info +logging_steps: 5 +logging_strategy: steps +optim: adamw_torch # adamw_torch paged_adamw_32bit galore_adamw lion_32bit adamw_apex_fused +# optim_target_modules: all-linear +weight_decay: 0.01 +lr_scheduler_type: cosine +max_seq_length: 4096 +max_steps: -1 +num_train_epochs: 1 +output_dir: /home/l069561/project/alignment-handbook/experiments/models/demo-llama-3-8b-qlora-ultrachat +overwrite_output_dir: true +per_device_train_batch_size: 4 +gradient_accumulation_steps: 4 +per_device_eval_batch_size: 4 +push_to_hub: false +report_to: +- tensorboard +- wandb +save_strategy: "steps" +save_steps: 100 +save_total_limit: 1 +seed: 42 +warmup_ratio: 0.1 + +torch_compile: false +# https://pytorch.org/docs/stable/generated/torch.compile.html ('cudagraphs', 'inductor', 'onnxrt', 'openxla', 'openxla_eval', 'tvm']) +# https://huggingface.co/docs/transformers/perf_train_gpu_one#using-torchcompile +torch_compile_backend: "inductor" +torch_compile_mode: "default" # reduce-overhead 
max-autotune diff --git a/scripts/run_sft.py b/scripts/run_sft.py index c9bfbbd4..8c74f20d 100644 --- a/scripts/run_sft.py +++ b/scripts/run_sft.py @@ -83,11 +83,6 @@ def main(): logger.info(f"Data parameters {data_args}") logger.info(f"Training/evaluation parameters {training_args}") - # Check for last checkpoint - last_checkpoint = get_checkpoint(training_args) - if last_checkpoint is not None and training_args.resume_from_checkpoint is None: - logger.info(f"Checkpoint detected, resuming training at {last_checkpoint}.") - ############### # Load datasets ############### @@ -170,6 +165,10 @@ def main(): train_dataset = raw_datasets["train"] eval_dataset = raw_datasets["test"] + # this is hard coded + training_args.dataset_text_field = "text" + + # # no need for logging samples # with training_args.main_process_first( # desc="Log a few random samples from the processed training set" # ): @@ -219,7 +218,6 @@ def main(): args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, - dataset_text_field="text", tokenizer=tokenizer, dataset_kwargs=training_args.dataset_kwargs, callbacks=[GpuUtilPrintCallBack()], @@ -230,7 +228,6 @@ def main(): args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, - dataset_text_field="text", tokenizer=tokenizer, peft_config=get_peft_config(model_args), dataset_kwargs=training_args.dataset_kwargs, @@ -242,11 +239,14 @@ def main(): ############### logger.info("*** Train ***") + # Check for last checkpoint + last_checkpoint = get_checkpoint(training_args) checkpoint = None if training_args.resume_from_checkpoint is not None: checkpoint = training_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint + logger.info(f"Checkpoint detected, resuming training at {checkpoint}.") train_result = trainer.train(resume_from_checkpoint=checkpoint) metrics = train_result.metrics diff --git a/src/alignment/model_utils.py b/src/alignment/model_utils.py index edfbe565..0d7d1e16 100644 
--- a/src/alignment/model_utils.py +++ b/src/alignment/model_utils.py @@ -94,13 +94,13 @@ def tokenizer_and_embedding_resize( ) tokenizer.add_special_tokens({k: v}) - model.resize_token_embeddings(len(tokenizer)) + model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8) model.get_input_embeddings().weight.data[-1] = tk_emb # add non special extra tokens if non_special_tokens_to_add: num_new_tokens = tokenizer.add_tokens(non_special_tokens_to_add) - model.resize_token_embeddings(len(tokenizer)) + model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8) if num_new_tokens > 0: input_embeddings_data = model.get_input_embeddings().weight.data output_embeddings_data = model.get_output_embeddings().weight.data @@ -151,6 +151,8 @@ def get_tokenizer( elif auto_set_chat_template and tokenizer.get_chat_template() is None: tokenizer.chat_template = DEFAULT_CHAT_TEMPLATE + tokenizer.pad_to_multiple_of = 8 + return tokenizer diff --git a/src/alignment/utils.py b/src/alignment/utils.py index 487ee003..06a1eefa 100644 --- a/src/alignment/utils.py +++ b/src/alignment/utils.py @@ -1,15 +1,10 @@ +from datetime import datetime + from transformers import TrainerCallback from pynvml import * -class GpuUtilPrintCallBack(TrainerCallback): - def on_log(self, args, state, control, logs=None, **kwargs): - if state.is_local_process_zero: - print(logs) - print_gpu_utilization() - - def print_gpu_utilization(): nvmlInit() handle = nvmlDeviceGetHandleByIndex(0) @@ -23,6 +18,15 @@ def print_summary(result): print_gpu_utilization() +class GpuUtilPrintCallBack(TrainerCallback): + def on_log(self, args, state, control, logs=None, **kwargs): + if state.is_local_process_zero: + print(datetime.strftime(datetime.now(), "%Y-%m-%d %H:%M:%S")) + print(logs) + print_gpu_utilization() + # print_summary(args) + + class ProfCallback(TrainerCallback): def __init__(self, prof): self.prof = prof