Phi2 multipack (#1173)

* phi2 multipack
* update validation and examples for phi
* more updates to phi examples
* make sure to use the correct collator for phi multipack
* phi needs attention mask now for multipack
* if the special token already exists in the tokenizer, don't require in lora modules to save
* fix qlora yml for phi, fix phi test validation
* test qlora too
* make sure flash attention is enabled for the test
* don't use remote code for phi anymore
* reduce sequence len for sample packing phi
axolotl-ai-cloud · Jan 23, 2024 · b3dc698 · b3dc698
1 parent 6a0d3e0
commit b3dc698
Show file tree

Hide file tree

Showing 18 changed files with 201 additions and 2,269 deletions.
diff --git a/examples/phi/phi-ft.yml b/examples/phi/phi-ft.yml
@@ -1,8 +1,6 @@
 base_model: microsoft/phi-1_5
-model_type: PhiForCausalLM
+model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
-is_llama_derived_model: false
-trust_remote_code: true
 
 load_in_8bit: false
 load_in_4bit: false
@@ -18,7 +16,7 @@ output_dir: ./phi-sft-out
 
 sequence_len: 2048
 sample_packing: true
-pad_to_sequence_len:
+pad_to_sequence_len: true
 
 adapter:
 lora_model_dir:
@@ -35,7 +33,7 @@ wandb_name:
 wandb_log_model:
 
 gradient_accumulation_steps: 1
-micro_batch_size: 1
+micro_batch_size: 2
 num_epochs: 4
 optimizer: adamw_torch
 adam_beta2: 0.95
@@ -45,18 +43,20 @@ lr_scheduler: cosine
 learning_rate: 0.000003
 
 train_on_inputs: false
-group_by_length: true
+group_by_length: false
 bf16: auto
 fp16:
 tf32: true
 
-gradient_checkpointing:
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: True
 early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
 logging_steps: 1
 xformers_attention:
-flash_attention:
+flash_attention: true
 
 warmup_steps: 100
 evals_per_epoch: 4
@@ -68,7 +68,4 @@ fsdp:
 fsdp_config:
 resize_token_embeddings_to_32x: true
 special_tokens:
-  bos_token: "<|endoftext|>"
-  eos_token: "<|endoftext|>"
-  unk_token: "<|endoftext|>"
   pad_token: "<|endoftext|>"
diff --git a/examples/phi/phi-qlora.yml b/examples/phi/phi-qlora.yml
@@ -1,8 +1,6 @@
 base_model: microsoft/phi-1_5
 model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
-is_llama_derived_model: false
-trust_remote_code: true
 
 load_in_8bit: false
 load_in_4bit: true
@@ -16,9 +14,9 @@ dataset_prepared_path:
 val_set_size: 0.05
 output_dir: ./phi-sft-out
 
-sequence_len: 1024
-sample_packing: false  # not CURRENTLY compatible with LoRAs
-pad_to_sequence_len:
+sequence_len: 2048
+sample_packing: true
+pad_to_sequence_len: true
 
 adapter: qlora
 lora_model_dir:
@@ -35,7 +33,7 @@ wandb_name:
 wandb_log_model:
 
 gradient_accumulation_steps: 1
-micro_batch_size: 1
+micro_batch_size: 2
 num_epochs: 4
 optimizer: adamw_torch
 adam_beta2: 0.95
@@ -45,18 +43,20 @@ lr_scheduler: cosine
 learning_rate: 0.000003
 
 train_on_inputs: false
-group_by_length: true
+group_by_length: false
 bf16: auto
 fp16:
 tf32: true
 
-gradient_checkpointing:
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: True
 early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
 logging_steps: 1
 xformers_attention:
-flash_attention:
+flash_attention: true
 
 warmup_steps: 100
 evals_per_epoch: 4
@@ -68,7 +68,4 @@ fsdp:
 fsdp_config:
 resize_token_embeddings_to_32x: true
 special_tokens:
-  bos_token: "<|endoftext|>"
-  eos_token: "<|endoftext|>"
-  unk_token: "<|endoftext|>"
   pad_token: "<|endoftext|>"
diff --git a/examples/phi/phi2-ft.yml b/examples/phi/phi2-ft.yml
@@ -1,8 +1,6 @@
 base_model: microsoft/phi-2
-model_revision:  834565c  # pin model repo to the previous architecture
 model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
-trust_remote_code: true
 
 load_in_8bit: false
 load_in_4bit: false
@@ -17,19 +15,16 @@ val_set_size: 0.05
 output_dir: ./phi-sft-out
 
 sequence_len: 2048
-sample_packing: false  # currently unsupported
-pad_to_sequence_len:
+sample_packing: true
+pad_to_sequence_len: true
 
 adapter:
 lora_model_dir:
-lora_r: 16
-lora_alpha: 32
-lora_dropout: 0.1
-lora_target_linear: true
+lora_r:
+lora_alpha:
+lora_dropout:
+lora_target_linear:
 lora_fan_in_fan_out:
-lora_modules_to_save:
-  - embd
-  - lm_head
 
 wandb_project:
 wandb_entity:
@@ -38,14 +33,14 @@ wandb_name:
 wandb_log_model:
 
 gradient_accumulation_steps: 1
-micro_batch_size: 1
+micro_batch_size: 2
 num_epochs: 4
-optimizer: paged_adamw_8bit
+optimizer: adamw_torch
 adam_beta2: 0.95
 adam_epsilon: 0.00001
 max_grad_norm: 1.0
 lr_scheduler: cosine
-learning_rate: 1e-5
+learning_rate: 0.000003
 
 train_on_inputs: false
 group_by_length: false
@@ -54,6 +49,8 @@ fp16:
 tf32: true
 
 gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: True
 early_stopping_patience:
 resume_from_checkpoint:
 local_rank:

diff --git a/src/axolotl/core/trainer_builder.py b/src/axolotl/core/trainer_builder.py
@@ -930,7 +930,7 @@ def build_collator(
             ]
         ]
         if use_batch_sampler_collator:
-            if self.cfg.model_config_type in ["mixtral", "qwen2"]:
+            if self.cfg.model_config_type in ["mixtral", "qwen2", "falcon", "phi"]:
                 collator = V2BatchSamplerDataCollatorForSeq2Seq
             else:
                 collator = BatchSamplerDataCollatorForSeq2Seq

diff --git a/src/axolotl/models/phi/__init__.py b/src/axolotl/models/phi/__init__.py
diff --git a/src/axolotl/models/phi/configuration_mixformer_sequential.py b/src/axolotl/models/phi/configuration_mixformer_sequential.py
diff --git a/src/axolotl/models/phi/configuration_phi.py b/src/axolotl/models/phi/configuration_phi.py