Remove prefix lm and denoising (#1065)
* Remove hf_prefix_lm

* Remove prefix_lm from mpt modeling

* Remove bidirectional mask

* Remove text denoising dataloading

* Remove adapt tokenizer
irenedea authored Mar 26, 2024
1 parent ed0647c commit b71e4b0
Showing 26 changed files with 24 additions and 1,850 deletions.
13 changes: 4 additions & 9 deletions llmfoundry/__init__.py
@@ -20,12 +20,10 @@
 hf_dynamic_modules_logger.addFilter(new_files_warning_filter)

 from llmfoundry import algorithms, callbacks, loggers, optim, registry, utils
-from llmfoundry.data import (ConcatTokensDataset, MixtureOfDenoisersCollator,
-                             NoConcatDataset, Seq2SeqFinetuningCollator,
-                             build_finetuning_dataloader,
-                             build_text_denoising_dataloader)
-from llmfoundry.models.hf import (ComposerHFCausalLM, ComposerHFPrefixLM,
-                                  ComposerHFT5)
+from llmfoundry.data import (ConcatTokensDataset, NoConcatDataset,
+                             Seq2SeqFinetuningCollator,
+                             build_finetuning_dataloader)
+from llmfoundry.models.hf import ComposerHFCausalLM, ComposerHFT5
 from llmfoundry.models.layers.attention import (
     MultiheadAttention, attn_bias_shape, build_alibi_bias, build_attn_bias,
     flash_attn_fn, scaled_multihead_dot_product_attention)
@@ -36,9 +34,7 @@
 from llmfoundry.tokenizers import TiktokenTokenizerWrapper

 __all__ = [
-    'build_text_denoising_dataloader',
     'build_finetuning_dataloader',
-    'MixtureOfDenoisersCollator',
     'Seq2SeqFinetuningCollator',
     'MPTBlock',
     'FFN_CLASS_REGISTRY',
@@ -50,7 +46,6 @@
     'MPTForCausalLM',
     'ComposerMPTCausalLM',
     'ComposerHFCausalLM',
-    'ComposerHFPrefixLM',
     'ComposerHFT5',
     'scaled_multihead_dot_product_attention',
     'flash_attn_fn',
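With this file updated, the denoising and prefix-LM symbols are gone from the package root. A minimal sketch of top-level imports that remain valid at this commit, with names read off the added import lines above:

```python
# Still importable from the package root after this commit (per the diff above).
from llmfoundry import (ConcatTokensDataset, NoConcatDataset,
                        Seq2SeqFinetuningCollator, build_finetuning_dataloader,
                        ComposerHFCausalLM, ComposerHFT5)

# These now fail, since the symbols were removed in this commit:
# from llmfoundry import build_text_denoising_dataloader   # ImportError
# from llmfoundry import MixtureOfDenoisersCollator        # ImportError
# from llmfoundry import ComposerHFPrefixLM                # ImportError
```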
5 changes: 0 additions & 5 deletions llmfoundry/data/__init__.py
@@ -3,21 +3,16 @@

 from llmfoundry.data.data import ConcatTokensDataset, NoConcatDataset
 from llmfoundry.data.dataloader import build_dataloader
-from llmfoundry.data.denoising import (MixtureOfDenoisersCollator,
-                                       build_text_denoising_dataloader)
 from llmfoundry.data.finetuning import (Seq2SeqFinetuningCollator,
                                         build_finetuning_dataloader)
 from llmfoundry.data.text_data import (StreamingTextDataset,
                                        build_text_dataloader)
 from llmfoundry.registry import dataloaders

 dataloaders.register('text', func=build_text_dataloader)
-dataloaders.register('text_denoising', func=build_text_denoising_dataloader)
 dataloaders.register('finetuning', func=build_finetuning_dataloader)

 __all__ = [
-    'MixtureOfDenoisersCollator',
-    'build_text_denoising_dataloader',
     'Seq2SeqFinetuningCollator',
     'build_finetuning_dataloader',
     'StreamingTextDataset',
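Only the `text` and `finetuning` dataloaders stay registered. A hedged sketch of looking them up; it assumes the registry exposes a catalogue-style `get`, which may differ in your version of llm-foundry:

```python
from llmfoundry.registry import dataloaders

# Both of these names are still registered by the module above.
build_text = dataloaders.get('text')
build_finetuning = dataloaders.get('finetuning')

# 'text_denoising' was unregistered in this commit, so looking it up
# (or using it as a dataloader name in a train config) now fails.
```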
957 changes: 0 additions & 957 deletions llmfoundry/data/denoising.py

This file was deleted.

10 changes: 0 additions & 10 deletions llmfoundry/data/finetuning/collator.py
@@ -331,31 +331,21 @@ def _process_and_batch_decoder_only(
                 self._warned_truncated = True

             attention_mask = [1] * len(input_ids)
-            # bidirectional_mask is used by our prefix lm model variants
-            # Note: this will be malformed if any loss-generating tokens are followed by non-loss-generating tokens
-            # (such as in the case of multi-turn chat examples)
-            bidirectional_mask = [
-                1 if label == _HF_IGNORE_INDEX else 0 for label in labels
-            ]

             # Annoyingly, we need to pad everything but input_ids
             # and attention_mask ourselves
             n_total = len(input_ids)
             i_pad = [_HF_IGNORE_INDEX] * (self.max_seq_len - n_total)
-            z_pad = [0] * (self.max_seq_len - n_total)
             if self.tokenizer.padding_side == 'left':
                 labels = i_pad + labels
-                bidirectional_mask = z_pad + bidirectional_mask
             else:
                 labels = labels + i_pad
-                bidirectional_mask = bidirectional_mask + z_pad

             # Update the example
             processed_example = {
                 'input_ids': input_ids,
                 'labels': labels,
                 'attention_mask': attention_mask,
-                'bidirectional_mask': bidirectional_mask,
             }

             processed_examples.append(processed_example)
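With `bidirectional_mask` dropped, each processed example now carries only `input_ids`, `labels`, and `attention_mask`. A small sketch of the surviving padding logic with illustrative values (the real code lives inside the collator class above):

```python
_HF_IGNORE_INDEX = -100   # same constant the collator uses
max_seq_len = 8           # illustrative value

input_ids = [101, 2, 3, 4, 5]
labels = [-100, -100, 3, 4, 5]        # prompt tokens are masked out of the loss
attention_mask = [1] * len(input_ids)

# Pad labels ourselves; input_ids/attention_mask are padded elsewhere.
i_pad = [_HF_IGNORE_INDEX] * (max_seq_len - len(input_ids))
labels = labels + i_pad               # padding_side == 'right'; 'left' would prepend

processed_example = {
    'input_ids': input_ids,
    'labels': labels,
    'attention_mask': attention_mask,  # no 'bidirectional_mask' key anymore
}
```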
15 changes: 0 additions & 15 deletions llmfoundry/data/finetuning/dataloader.py
@@ -557,15 +557,6 @@ def _build_collate_fn(
                         1)],
                     skip_special_tokens=False,
                     clean_up_tokenization_spaces=True))
-            print(
-                '\033[92m{}\033[00m\n'.format('CONTEXT: '),
-                tokenizer.decode(batch['input_ids'][
-                    j,
-                    torch.logical_and(
-                        is_subseq, batch['bidirectional_mask'][j] ==
-                        1)],
-                    skip_special_tokens=False,
-                    clean_up_tokenization_spaces=True))
             print(
                 '\033[91m{}\033[00m\n'.format('TARGET: '),
                 tokenizer.decode(batch['input_ids'][
@@ -583,12 +574,6 @@
                     batch['attention_mask'][j] == 1],
                 skip_special_tokens=False,
                 clean_up_tokenization_spaces=True))
-            print(
-                '\033[92m{}\033[00m\n'.format('CONTEXT: '),
-                tokenizer.decode(batch['input_ids'][
-                    j, batch['bidirectional_mask'][j] == 1],
-                    skip_special_tokens=False,
-                    clean_up_tokenization_spaces=True))
             print(
                 '\033[91m{}\033[00m\n'.format('TARGET: '),
                 tokenizer.decode(batch['input_ids'][
2 changes: 0 additions & 2 deletions llmfoundry/data/packing.py
@@ -71,7 +71,6 @@ def pack(self, batch: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
             'input_ids',
             'labels',
             'attention_mask',
-            'bidirectional_mask',
             'sequence_id',
         ]
         # Cut everything down to size
@@ -278,7 +277,6 @@ def pad_tensor(tensor: torch.Tensor, pad_value: int):
         'input_ids': pad_token_id,
         'labels': -100,
         'attention_mask': 0,
-        'bidirectional_mask': 0,
         'sequence_id': -1,
     }
     keys = packed_examples[0].keys()
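For reference, a sketch of padding a packed example with the per-key pad values that remain (values copied from the diff; `pad_to` is an illustrative helper, not the packing module's actual function):

```python
import torch

pad_token_id = 0  # assumed here; in practice this comes from the tokenizer
PAD_VALUES = {
    'input_ids': pad_token_id,
    'labels': -100,
    'attention_mask': 0,
    'sequence_id': -1,  # 'bidirectional_mask' no longer appears in this mapping
}

def pad_to(t: torch.Tensor, length: int, pad_value: int) -> torch.Tensor:
    """Right-pad a 1-D tensor to `length` with `pad_value` (illustrative)."""
    fill = torch.full((length - t.shape[0],), pad_value, dtype=t.dtype)
    return torch.cat([t, fill])

example = {
    'input_ids': torch.tensor([5, 6, 7]),
    'labels': torch.tensor([-100, 6, 7]),
    'attention_mask': torch.tensor([1, 1, 1]),
    'sequence_id': torch.tensor([0, 0, 0]),
}
padded = {k: pad_to(v, 8, PAD_VALUES[k]) for k, v in example.items()}
```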
6 changes: 0 additions & 6 deletions llmfoundry/metrics/__init__.py
@@ -43,11 +43,6 @@
     'code_eval_accuracy',
 ]

-DEFAULT_PREFIX_LM_METRICS = [
-    'language_cross_entropy',
-    'masked_accuracy',
-]
-
 DEFAULT_ENC_DEC_METRICS = [
     'language_cross_entropy',
     'masked_accuracy',
@@ -66,6 +61,5 @@
     'MaskedAccuracy',
     'DEFAULT_CAUSAL_LM_TRAIN_METRICS',
     'DEFAULT_CAUSAL_LM_EVAL_METRICS',
-    'DEFAULT_PREFIX_LM_METRICS',
     'DEFAULT_ENC_DEC_METRICS',
 ]
5 changes: 1 addition & 4 deletions llmfoundry/models/__init__.py
@@ -1,8 +1,7 @@
 # Copyright 2022 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0

-from llmfoundry.models.hf import (ComposerHFCausalLM, ComposerHFPrefixLM,
-                                  ComposerHFT5)
+from llmfoundry.models.hf import ComposerHFCausalLM, ComposerHFT5
 from llmfoundry.models.inference_api_wrapper import (FMAPICasualLMEvalWrapper,
                                                      FMAPIChatAPIEvalWrapper,
                                                      OpenAICausalLMEvalWrapper,
@@ -13,7 +12,6 @@

 models.register('mpt_causal_lm', func=ComposerMPTCausalLM)
 models.register('hf_causal_lm', func=ComposerHFCausalLM)
-models.register('hf_prefix_lm', func=ComposerHFPrefixLM)
 models.register('hf_t5', func=ComposerHFT5)
 models.register('openai_causal_lm', func=OpenAICausalLMEvalWrapper)
 models.register('fmapi_causal_lm', func=FMAPICasualLMEvalWrapper)
@@ -22,7 +20,6 @@

 __all__ = [
     'ComposerHFCausalLM',
-    'ComposerHFPrefixLM',
     'ComposerHFT5',
     'MPTConfig',
     'MPTPreTrainedModel',
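Any config that used the removed `hf_prefix_lm` entry now has to target one of the names still registered above. A hedged sketch; the config keys are typical llm-foundry fields and are not taken from this diff:

```python
# Some of the model names still registered at this commit (see register() calls above).
STILL_REGISTERED = {'mpt_causal_lm', 'hf_causal_lm', 'hf_t5',
                    'openai_causal_lm', 'fmapi_causal_lm'}

model_cfg = {
    'name': 'hf_causal_lm',                   # was 'hf_prefix_lm' before this commit
    'pretrained_model_name_or_path': 'gpt2',  # illustrative checkpoint
}
assert model_cfg['name'] in STILL_REGISTERED
```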
2 changes: 0 additions & 2 deletions llmfoundry/models/hf/__init__.py
@@ -5,12 +5,10 @@
 from llmfoundry.models.hf.hf_fsdp import (prepare_hf_causal_lm_model_for_fsdp,
                                           prepare_hf_enc_dec_model_for_fsdp,
                                           prepare_hf_model_for_fsdp)
-from llmfoundry.models.hf.hf_prefix_lm import ComposerHFPrefixLM
 from llmfoundry.models.hf.hf_t5 import ComposerHFT5

 __all__ = [
     'ComposerHFCausalLM',
-    'ComposerHFPrefixLM',
     'ComposerHFT5',
     'prepare_hf_causal_lm_model_for_fsdp',
     'prepare_hf_enc_dec_model_for_fsdp',
144 changes: 0 additions & 144 deletions llmfoundry/models/hf/hf_prefix_lm.py

This file was deleted.

13 changes: 1 addition & 12 deletions llmfoundry/models/hf/hf_t5.py
@@ -15,8 +15,7 @@
 from llmfoundry.metrics import DEFAULT_ENC_DEC_METRICS
 from llmfoundry.models.hf.hf_fsdp import hf_get_init_device
 from llmfoundry.models.hf.model_wrapper import HuggingFaceModelWithFSDP
-from llmfoundry.models.utils import (adapt_tokenizer_for_denoising,
-                                     init_empty_weights)
+from llmfoundry.models.utils import init_empty_weights
 from llmfoundry.utils.warnings import experimental_class

 __all__ = ['ComposerHFT5']
@@ -42,12 +41,6 @@ class ComposerHFT5(HuggingFaceModelWithFSDP):
         cfg.init_device ('cpu' | 'meta'): Which device, 'cpu' or 'meta', to
             initialize the model on. Currently, `meta` is only supported when
             cfg.pretrained is ``False``. Default: ``'cpu'``.
-        cfg.adapt_vocab_for_denoising (bool, optional): Whether to adapt the vocab
-            of the model/tokenizer to include sentinel tokens that are used in denoising
-            tasks like Span Corruption. If you intend to load from an existing Composer
-            checkpoint that was trained on such a task, set this to ``True`` to ensure
-            that the model vocab size matches your checkpoint's vocab size when loading
-            the weights. Default: ``False``.
         tokenizer (PreTrainedTokenizer): The tokenizer that the model will use.
     """

@@ -84,10 +77,6 @@ def __init__(self, om_model_config: DictConfig,
            raise ValueError(f'Model type "hf_t5" currently only supports T5 models ' +\
                             f'using configs where `is_encoder_decoder` is ``True``.')

-        # Set up the tokenizer (add tokens for denoising sentinels if needed)
-        if om_model_config.get('adapt_vocab_for_denoising', False):
-            adapt_tokenizer_for_denoising(tokenizer)
-
         init_device = om_model_config.get('init_device', 'cpu')

         # Get the device we want to initialize, and use the
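With `adapt_vocab_for_denoising` removed from both the docstring and the constructor, an `hf_t5` config only needs the remaining fields. A hedged sketch; apart from `init_device` and `pretrained`, which appear in the docstring above, the keys are assumptions about typical llm-foundry configs:

```python
from omegaconf import OmegaConf

model_cfg = OmegaConf.create({
    'name': 'hf_t5',                             # registry name from models/__init__.py
    'pretrained_model_name_or_path': 't5-base',  # assumed key/value, for illustration
    'pretrained': True,
    'init_device': 'cpu',
    # 'adapt_vocab_for_denoising': True  <- no longer recognized after this commit
})
```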
7 changes: 3 additions & 4 deletions llmfoundry/models/layers/attention.py
@@ -645,17 +645,16 @@


 def attn_bias_shape(
-        attn_impl: str, n_heads: int, seq_len: int, alibi: bool,
-        prefix_lm: bool, causal: bool,
+        attn_impl: str, n_heads: int, seq_len: int, alibi: bool, causal: bool,
         use_sequence_id: bool) -> Optional[tuple[int, int, int, int]]:
     if attn_impl == 'flash':
         return None
     elif attn_impl == 'torch':
         if alibi:
-            if (prefix_lm or not causal) or use_sequence_id:
+            if (not causal) or use_sequence_id:
                 return (1, n_heads, seq_len, seq_len)
             return (1, n_heads, 1, seq_len)
-        elif prefix_lm or use_sequence_id:
+        elif use_sequence_id:
             return (1, 1, seq_len, seq_len)
         return None
     else:
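The simplified `attn_bias_shape` now only branches on ALiBi, causality, and sequence-id masking. A quick sketch of its behavior, with return shapes read straight from the branches above:

```python
from llmfoundry.models.layers.attention import attn_bias_shape

n_heads, seq_len = 8, 2048

# Flash attention never materializes an explicit bias tensor.
assert attn_bias_shape('flash', n_heads, seq_len, alibi=True, causal=True,
                       use_sequence_id=False) is None

# Torch attention with ALiBi and a plain causal mask only needs a (1, H, 1, S) bias.
assert attn_bias_shape('torch', n_heads, seq_len, alibi=True, causal=True,
                       use_sequence_id=False) == (1, n_heads, 1, seq_len)

# Sequence-id masking (e.g. packed sequences) still needs the full (1, 1, S, S) bias.
assert attn_bias_shape('torch', n_heads, seq_len, alibi=False, causal=True,
                       use_sequence_id=True) == (1, 1, seq_len, seq_len)
```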