From e8cbf50be698413f205bd7a603c01e2aa1d83231 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Wed, 27 Sep 2023 11:12:08 -0400
Subject: [PATCH] attention_mask not needed for training (#642)

* attention_mask not needed for training

* specifically don't use attention mask for phi

* use a different check for phi

* small fixes since phi removed some values from their config
---
 src/axolotl/models/phi/modeling_mixformer_sequential.py | 8 ++------
 src/axolotl/utils/data.py                               | 2 +-
 src/axolotl/utils/trainer.py                            | 9 ++++++++-
 3 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/src/axolotl/models/phi/modeling_mixformer_sequential.py b/src/axolotl/models/phi/modeling_mixformer_sequential.py
index 27bede5f01..fd2ec054c5 100644
--- a/src/axolotl/models/phi/modeling_mixformer_sequential.py
+++ b/src/axolotl/models/phi/modeling_mixformer_sequential.py
@@ -711,12 +711,8 @@ def __init__(
         self.resid_dropout = nn.Dropout(config.resid_pdrop)
         self.block_idx = block_idx
 
-        self.mixer = MHA(config=config, **mixer, layer_idx=block_idx)
-        mlp_cls = mlp.pop("mlp_cls")
-        if mlp_cls == "fused_mlp":
-            self.mlp = FusedMLP(config=config, **mlp)
-        else:
-            self.mlp = MLP(config=config, **mlp)
+        self.mixer = MHA(config, layer_idx=block_idx)
+        self.mlp = MLP(config)
 
     def forward(
         self,
diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py
index 2dc2d82b28..9792371c7c 100644
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -76,7 +76,7 @@ def prepare_dataset(cfg, tokenizer):
 
     with zero_first(is_main_process()):
         train_dataset, eval_dataset = process_datasets_for_packing(
-            cfg, train_dataset, eval_dataset
+            cfg, train_dataset, eval_dataset, tokenizer
         )
         if cfg.max_steps:
             total_num_steps = min(
diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py
index a2657e05ab..aee2a1b99e 100644
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -397,7 +397,7 @@ def disable_datasets_caching():
         set_caching_enabled(True)
 
 
-def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
+def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
     drop_long = partial(drop_long_seq, sequence_len=cfg.sequence_len)
     with zero_first(is_main_process()):
         train_dataset = train_dataset.filter(drop_long, num_proc=os.cpu_count())
@@ -414,6 +414,13 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
                 eval_dataset = eval_dataset.map(
                     add_position_ids, num_proc=os.cpu_count()
                 )
+
+    # Phi doesn't want the attention_mask feature when training
+    if "CodeGenTokenizer" in tokenizer.__class__.__name__:
+        train_dataset = train_dataset.remove_columns("attention_mask")
+        if eval_dataset:
+            eval_dataset = eval_dataset.remove_columns("attention_mask")
+
     return train_dataset, eval_dataset
 
 
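
Sketch (illustration only, not part of the patch): the trainer-side change keys the column drop off the tokenizer's class name, since the phi MixFormer models ship a CodeGen-style tokenizer. The snippet below mirrors that check on a toy dataset. FakeCodeGenTokenizer is a hypothetical stand-in used only so the name check matches; remove_columns is the real Hugging Face datasets.Dataset method the patch calls.

# Illustration only -- mirrors the CodeGenTokenizer check added to
# process_datasets_for_packing, applied to a toy dataset.
from datasets import Dataset


class FakeCodeGenTokenizer:  # hypothetical stand-in; name contains "CodeGenTokenizer"
    pass


def drop_attention_mask_for_phi(train_dataset, eval_dataset, tokenizer):
    # Phi doesn't want the attention_mask feature when training
    if "CodeGenTokenizer" in tokenizer.__class__.__name__:
        train_dataset = train_dataset.remove_columns("attention_mask")
        if eval_dataset:
            eval_dataset = eval_dataset.remove_columns("attention_mask")
    return train_dataset, eval_dataset


train = Dataset.from_dict(
    {
        "input_ids": [[1, 2, 3], [4, 5, 6]],
        "attention_mask": [[1, 1, 1], [1, 1, 1]],
        "labels": [[1, 2, 3], [4, 5, 6]],
    }
)
train, _ = drop_attention_mask_for_phi(train, None, FakeCodeGenTokenizer())
print(train.column_names)  # ['input_ids', 'labels'] -- attention_mask dropped

The intent, per the subject line, is that with the column removed up front, batches collated for phi training simply never carry an attention_mask.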