diff --git a/experiments/demo_dgx2.sh b/experiments/demo_dgx2.sh
index a0a8aa1e..6e5d8fa4 100644
--- a/experiments/demo_dgx2.sh
+++ b/experiments/demo_dgx2.sh
@@ -7,8 +7,13 @@ echo activate virtual ENV
 PYTHON_ENV=${ROOT}/project/scripts/v2306.sh
 source $PYTHON_ENV
 
+
+# CUDA
+export CUDA_VISIBLE_DEVICES=0,1
+export CUDA_LAUNCH_BLOCKING="1"
+
 # number of GPUs; here we use all GPUs for demo
-WORLD_SIZE=3
+WORLD_SIZE=2
 
 # HF cache
 export TMPDIR="${ROOT}/project/.cache/"
@@ -32,9 +37,17 @@ export ACCELERATE_LOG_LEVEL=debug
 export ACCELERATE_DEBUG_MODE="1"
 export DEEPSPEED_TIMEOUT=120
 
+# accelerate launch
 accelerate launch \
     --config_file ${ROOT}/project/alignment_handbook/recipes/accelerate_configs/deepspeed_zero2.yaml \
     --num_processes $WORLD_SIZE \
     --tee 3 \
     ${ROOT}/project/alignment_handbook/scripts/run_sft.py \
     ${ROOT}/project/alignment_handbook/recipes/llama3-8b/sft/config_qlora.yaml
+    # ${ROOT}/project/alignment_handbook/recipes/llama3-8b/sft/config_full.yaml
+
+
+# deepspeed launch
+
+
+# torch launch
\ No newline at end of file
diff --git a/experiments/demo_dgx2_launch.sh b/experiments/demo_dgx2_launch.sh
index 637d4bdd..f552b5b3 100644
--- a/experiments/demo_dgx2_launch.sh
+++ b/experiments/demo_dgx2_launch.sh
@@ -5,9 +5,6 @@ ROOT=$(realpath ~)
 
 # singularity container
 CONTAINER=${ROOT}/project/singularity_containers/py2402.sig
 
-# CUDA
-export CUDA_VISIBLE_DEVICES=0,1
-
 # PATH
 DEMO_PATH=${ROOT}/project/alignment_handbook/experiments
diff --git a/experiments/demo_magtrain_slurm.sh b/experiments/demo_magtrain_slurm.sh
new file mode 100644
index 00000000..e69de29b
diff --git a/recipes/accelerate_configs/deepspeed_zero2.yaml b/recipes/accelerate_configs/deepspeed_zero2.yaml
index d6c76abf..af509c7d 100644
--- a/recipes/accelerate_configs/deepspeed_zero2.yaml
+++ b/recipes/accelerate_configs/deepspeed_zero2.yaml
@@ -1,15 +1,15 @@
 compute_environment: LOCAL_MACHINE
 debug: true
 deepspeed_config:
-  deepspeed_config_file: /home/l069561/project/alignment-handbook/recipes/accelerate_configs/deepspeed_zs2.json
-  zero3_init_flag: true
-  # deepspeed_multinode_launcher: standard
-  # offload_optimizer_device: none
-  # offload_param_device: none
+  # deepspeed_config_file: '/home/l069561/project/alignment_handbook/recipes/accelerate_configs/ds_acc_conf.json'
   # zero3_init_flag: true
-  # zero3_save_16bit_model: false
-  # zero_stage: 2
-  # mixed_precision: bf16
+  deepspeed_multinode_launcher: standard
+  offload_optimizer_device: none
+  offload_param_device: none
+  zero3_init_flag: true
+  zero3_save_16bit_model: false
+  zero_stage: 2
+  mixed_precision: bf16
 distributed_type: DEEPSPEED
 downcast_bf16: 'no'
 machine_rank: 0
diff --git a/recipes/accelerate_configs/deepspeed_zs2.json b/recipes/accelerate_configs/deepspeed_zs2.json
index dfa80708..b3347327 100644
--- a/recipes/accelerate_configs/deepspeed_zs2.json
+++ b/recipes/accelerate_configs/deepspeed_zs2.json
@@ -41,7 +41,7 @@
         "reduce_bucket_size": "auto",
         "contiguous_gradients": true
     },
-    "gradient_accumulation_steps": 1,
+    "gradient_accumulation_steps": "auto",
     "gradient_clipping": "auto",
     "steps_per_print": 2000,
     "train_batch_size": "auto",
diff --git a/recipes/accelerate_configs/ds_acc_conf.json b/recipes/accelerate_configs/ds_acc_conf.json
new file mode 100644
index 00000000..cea626d6
--- /dev/null
+++ b/recipes/accelerate_configs/ds_acc_conf.json
@@ -0,0 +1,23 @@
+{
+    "bf16": {
+        "enabled": true
+    },
+    "zero_optimization": {
+        "stage": 2,
+        "stage3_gather_16bit_weights_on_model_save": true,
+        "offload_optimizer": {
+            "device": "none"
+        },
+        "offload_param": {
+            "device": "none"
+        }
+    },
+    "gradient_clipping": "auto",
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "gradient_accumulation_steps": "auto",
+    "steps_per_print": 200000,
+    "fp16": {
+        "enabled": false
+    }
+}
\ No newline at end of file
diff --git a/recipes/accelerate_configs/readme.md b/recipes/accelerate_configs/readme.md
index 3134d397..083cc2d0 100644
--- a/recipes/accelerate_configs/readme.md
+++ b/recipes/accelerate_configs/readme.md
@@ -1,7 +1,6 @@
 ## deepspeed optimizers
 - DeepSpeed natively supports Adam, AdamW, OneBitAdam, Lamb, OneBitLamb, FusedLamb, FusedAdam
 - see for details on how to config https://deepspeed.readthedocs.io/en/latest/optimizers.html
--
 ```json
 {
     "optimizer": {
@@ -20,7 +19,7 @@
             "factor_min": 0.5,
             "factor_threshold": 0.1
         }
-    },
+    }
 }
 
 {
@@ -33,6 +32,25 @@
             "max_coeff": 0.3,
             "min_coeff": 0.01
         }
-    },
+    }
+}
+```
+
+- fp16 vs bf16
+```json
+{
+    "fp16": {
+        "enabled": false,
+        "loss_scale": 0,
+        "auto_cast": false,
+        "loss_scale_window": 1000,
+        "initial_scale_power": 16,
+        "hysteresis": 2,
+        "consecutive_hysteresis": false,
+        "min_loss_scale": 1
+    },
+    "bf16": {
+        "enabled": true
+    }
 }
 ```
\ No newline at end of file
diff --git a/recipes/llama3-8b/sft/config_full.yaml b/recipes/llama3-8b/sft/config_full.yaml
index 1cafb813..80dbcdbe 100644
--- a/recipes/llama3-8b/sft/config_full.yaml
+++ b/recipes/llama3-8b/sft/config_full.yaml
@@ -11,7 +11,7 @@ dataset_mixer:
 dataset_splits:
 - train_sft
 - test_sft
-preprocessing_num_workers: 8
+preprocessing_num_workers: 32
 
 # SFT trainer config
 bf16: true
@@ -27,6 +27,7 @@ logging_steps: 5
 logging_strategy: steps
 learning_rate: 2.0e-05
 optim: galore_adamw # adamw_torch paged_adamw_32bit galore_adamw lion_32bit
+optim_target_modules: all-linear
 weight_decay: 0.01
 lr_scheduler_type: cosine
 max_seq_length: 4096
diff --git a/recipes/llama3-8b/sft/config_qlora.yaml b/recipes/llama3-8b/sft/config_qlora.yaml
index d2e19745..c1465d93 100644
--- a/recipes/llama3-8b/sft/config_qlora.yaml
+++ b/recipes/llama3-8b/sft/config_qlora.yaml
@@ -5,8 +5,9 @@ torch_dtype: bfloat16
 use_flash_attention_2: true
 
 # LoRA arguments
-load_in_4bit: true
+use_unsloth: false # unsloth not support deepspeed yet
 use_peft: true
+load_in_4bit: true
 lora_r: 32
 lora_alpha: 32
 lora_dropout: 0.05
@@ -26,14 +27,13 @@ dataset_mixer:
 dataset_splits:
 - train_sft
 - test_sft
-preprocessing_num_workers: 16
+preprocessing_num_workers: 32
 auto_insert_empty_system_msg: true
 
 # SFT trainer config
 bf16: true
 do_eval: true
 evaluation_strategy: epoch
-gradient_accumulation_steps: 16
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
   use_reentrant: false
@@ -41,18 +41,22 @@ learning_rate: 1.0e-04
 log_level: info
 logging_steps: 5
 logging_strategy: steps
+optim: adamw_torch # adamw_torch paged_adamw_32bit galore_adamw lion_32bit
+# optim_target_modules: all-linear
+weight_decay: 0.01
 lr_scheduler_type: cosine
-max_seq_length: 4096
+max_seq_length: 2048
 max_steps: -1
 num_train_epochs: 1
 output_dir: /home/l069561/project/models/fine-tuned/demo-llama-3-8b-lora-ultrachat
 overwrite_output_dir: true
-per_device_eval_batch_size: 8
+per_device_eval_batch_size: 2
+gradient_accumulation_steps: 32
 per_device_train_batch_size: 4
 push_to_hub: false
 report_to:
 - tensorboard
-# - wandb
+- wandb
 save_strategy: "steps"
 save_steps: 100
 save_total_limit: 1
diff --git a/requirements.txt b/requirements.txt
index f66031f2..a83745a3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,4 +19,7 @@ flash-attn>=2.1.0
 pynvml>=11.4.0
 
 # optional
-galore-torch
\ No newline at end of file
+galore-torch
+
+# unsloth
+# with NV pytorch container install -> pip install git+https://github.com/unslothai/unsloth.git --no-deps
\ No newline at end of file
diff --git a/scripts/run_sft.py b/scripts/run_sft.py
index d7bf4b4d..3a7879b1 100644
--- a/scripts/run_sft.py
+++ b/scripts/run_sft.py
@@ -141,6 +141,7 @@ def main():
     if (
         "<|im_start|>" in tokenizer.chat_template
         and "gemma-tokenizer-chatml" not in tokenizer.name_or_path
+        and not model_args.use_unsloth
    ):
         model = AutoModelForCausalLM.from_pretrained(
             model_args.model_name_or_path, **model_kwargs
         )
@@ -151,6 +152,7 @@
     #####################
     # Apply chat template
     #####################
+    logger.info("*** apply chat template ***")
     raw_datasets = raw_datasets.map(
         apply_chat_template,
         fn_kwargs={
@@ -192,15 +194,23 @@
     ########################
     # Initialize the Trainer
     ########################
+
     if model_args.use_unsloth:
+        logger.info("*** use unsloth ***")
         from alignment.unsloth import get_unsloth_peft_model
 
         peft_config = get_peft_config(model_args)
-        model = AutoModelForCausalLM.from_pretrained(
-            model_args.model_name_or_path, **model_kwargs
+        model, tokenizer = get_unsloth_peft_model(
+            model_args.model_name_or_path,
+            training_args.max_seq_length,
+            peft_config.to_dict(),
         )
-        model, tokenizer = setup_chat_format(model, tokenizer)
-        model = get_unsloth_peft_model(model, training_args.max_seq_length, peft_config)
+
+        if (
+            "<|im_start|>" in tokenizer.chat_template
+            and "gemma-tokenizer-chatml" not in tokenizer.name_or_path
+        ):
+            model, tokenizer = setup_chat_format(model, tokenizer)
 
     trainer = SFTTrainer(
@@ -208,6 +218,7 @@
         train_dataset=train_dataset,
         eval_dataset=eval_dataset,
         dataset_text_field="text",
+        dataset_num_proc=data_args.preprocessing_num_workers,
         max_seq_length=training_args.max_seq_length,
         tokenizer=tokenizer,
         packing=True,
@@ -222,6 +233,7 @@
         train_dataset=train_dataset,
         eval_dataset=eval_dataset,
         dataset_text_field="text",
+        dataset_num_proc=data_args.preprocessing_num_workers,
         max_seq_length=training_args.max_seq_length,
         tokenizer=tokenizer,
         packing=True,
diff --git a/src/alignment/unsloth.py b/src/alignment/unsloth.py
index 5d1f7369..44bc42e9 100644
--- a/src/alignment/unsloth.py
+++ b/src/alignment/unsloth.py
@@ -66,18 +66,32 @@ def load_unsloth_pretrained_model(config, model_args):
     return model
 
 
-def get_unsloth_peft_model(model, max_seq_length, peft_kwargs: Dict[str, Any]):
+def get_unsloth_peft_model(model_name, max_seq_length, peft_kwargs: Dict[str, Any]):
     r"""
     Gets the peft model for the pretrained model with unsloth. Used in training.
     """
     from unsloth import FastLanguageModel
 
+    model, tokenizer = FastLanguageModel.from_pretrained(
+        model_name=model_name,
+        max_seq_length=max_seq_length,
+        dtype=None,
+        load_in_4bit=True,
+    )
+
     unsloth_peft_kwargs = {
         "model": model,
         "max_seq_length": max_seq_length,
         "use_gradient_checkpointing": "unsloth",
     }
-    return FastLanguageModel.get_peft_model(**peft_kwargs, **unsloth_peft_kwargs)
+
+    peft_kwargs["lora_dropout"] = 0.0
+    peft_kwargs.pop("task_type", None)
+
+    return (
+        FastLanguageModel.get_peft_model(**unsloth_peft_kwargs, **peft_kwargs),
+        tokenizer,
+    )
 
 
 def load_unsloth_peft_model(config, model_args, is_trainable: bool):