Remove prefix lm and denoising (#1065)
* Remove hf_prefix_lm

* Remove prefix_lm from mpt modeling

* Remove bidirectional mask

* Remove text denoising dataloading

* Remove adapt tokenizer
irenedea authored Mar 26, 2024
1 parent ed0647c commit b71e4b0
Showing 26 changed files with 24 additions and 1,850 deletions.
13 changes: 4 additions & 9 deletions llmfoundry/__init__.py
@@ -20,12 +20,10 @@
 hf_dynamic_modules_logger.addFilter(new_files_warning_filter)

 from llmfoundry import algorithms, callbacks, loggers, optim, registry, utils
-from llmfoundry.data import (ConcatTokensDataset, MixtureOfDenoisersCollator,
-                             NoConcatDataset, Seq2SeqFinetuningCollator,
-                             build_finetuning_dataloader,
-                             build_text_denoising_dataloader)
-from llmfoundry.models.hf import (ComposerHFCausalLM, ComposerHFPrefixLM,
-                                  ComposerHFT5)
+from llmfoundry.data import (ConcatTokensDataset, NoConcatDataset,
+                             Seq2SeqFinetuningCollator,
+                             build_finetuning_dataloader)
+from llmfoundry.models.hf import ComposerHFCausalLM, ComposerHFT5
 from llmfoundry.models.layers.attention import (
     MultiheadAttention, attn_bias_shape, build_alibi_bias, build_attn_bias,
     flash_attn_fn, scaled_multihead_dot_product_attention)
@@ -36,9 +34,7 @@
 from llmfoundry.tokenizers import TiktokenTokenizerWrapper

 __all__ = [
-    'build_text_denoising_dataloader',
     'build_finetuning_dataloader',
-    'MixtureOfDenoisersCollator',
     'Seq2SeqFinetuningCollator',
     'MPTBlock',
     'FFN_CLASS_REGISTRY',
@@ -50,7 +46,6 @@
     'MPTForCausalLM',
     'ComposerMPTCausalLM',
     'ComposerHFCausalLM',
-    'ComposerHFPrefixLM',
     'ComposerHFT5',
     'scaled_multihead_dot_product_attention',
     'flash_attn_fn',
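With this file updated, the denoising and prefix-LM symbols are gone from the package root. A minimal sketch of top-level imports that remain valid at this commit, with names read off the added import lines above:

```python
# Still importable from the package root after this commit (per the diff above).
from llmfoundry import (ConcatTokensDataset, NoConcatDataset,
                        Seq2SeqFinetuningCollator, build_finetuning_dataloader,
                        ComposerHFCausalLM, ComposerHFT5)

# These now fail, since the symbols were removed in this commit:
# from llmfoundry import build_text_denoising_dataloader   # ImportError
# from llmfoundry import MixtureOfDenoisersCollator        # ImportError
# from llmfoundry import ComposerHFPrefixLM                # ImportError
```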
5 changes: 0 additions & 5 deletions llmfoundry/data/__init__.py
@@ -3,21 +3,16 @@

 from llmfoundry.data.data import ConcatTokensDataset, NoConcatDataset
 from llmfoundry.data.dataloader import build_dataloader
-from llmfoundry.data.denoising import (MixtureOfDenoisersCollator,
-                                       build_text_denoising_dataloader)
 from llmfoundry.data.finetuning import (Seq2SeqFinetuningCollator,
                                         build_finetuning_dataloader)
 from llmfoundry.data.text_data import (StreamingTextDataset,
                                        build_text_dataloader)
 from llmfoundry.registry import dataloaders

 dataloaders.register('text', func=build_text_dataloader)
-dataloaders.register('text_denoising', func=build_text_denoising_dataloader)
 dataloaders.register('finetuning', func=build_finetuning_dataloader)

 __all__ = [
-    'MixtureOfDenoisersCollator',
-    'build_text_denoising_dataloader',
     'Seq2SeqFinetuningCollator',
     'build_finetuning_dataloader',
     'StreamingTextDataset',
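Only the `text` and `finetuning` dataloaders stay registered. A hedged sketch of looking them up; it assumes the registry exposes a catalogue-style `get`, which may differ in your version of llm-foundry:

```python
from llmfoundry.registry import dataloaders

# Both of these names are still registered by the module above.
build_text = dataloaders.get('text')
build_finetuning = dataloaders.get('finetuning')

# 'text_denoising' was unregistered in this commit, so looking it up
# (or using it as a dataloader name in a train config) now fails.
```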
957 changes: 0 additions & 957 deletions llmfoundry/data/denoising.py

This file was deleted.

10 changes: 0 additions & 10 deletions llmfoundry/data/finetuning/collator.py
@@ -331,31 +331,21 @@ def _process_and_batch_decoder_only(
                 self._warned_truncated = True

             attention_mask = [1] * len(input_ids)
-            # bidirectional_mask is used by our prefix lm model variants
-            # Note: this will be malformed if any loss-generating tokens are followed by non-loss-generating tokens
-            # (such as in the case of multi-turn chat examples)
-            bidirectional_mask = [
-                1 if label == _HF_IGNORE_INDEX else 0 for label in labels
-            ]

             # Annoyingly, we need to pad everything but input_ids
             # and attention_mask ourselves
             n_total = len(input_ids)
             i_pad = [_HF_IGNORE_INDEX] * (self.max_seq_len - n_total)
-            z_pad = [0] * (self.max_seq_len - n_total)
             if self.tokenizer.padding_side == 'left':
                 labels = i_pad + labels
-                bidirectional_mask = z_pad + bidirectional_mask
             else:
                 labels = labels + i_pad
-                bidirectional_mask = bidirectional_mask + z_pad

             # Update the example
             processed_example = {
                 'input_ids': input_ids,
                 'labels': labels,
                 'attention_mask': attention_mask,
-                'bidirectional_mask': bidirectional_mask,
             }

             processed_examples.append(processed_example)
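With `bidirectional_mask` dropped, each processed example now carries only `input_ids`, `labels`, and `attention_mask`. A small sketch of the surviving padding logic with illustrative values (the real code lives inside the collator class above):

```python
_HF_IGNORE_INDEX = -100   # same constant the collator uses
max_seq_len = 8           # illustrative value

input_ids = [101, 2, 3, 4, 5]
labels = [-100, -100, 3, 4, 5]        # prompt tokens are masked out of the loss
attention_mask = [1] * len(input_ids)

# Pad labels ourselves; input_ids/attention_mask are padded elsewhere.
i_pad = [_HF_IGNORE_INDEX] * (max_seq_len - len(input_ids))
labels = labels + i_pad               # padding_side == 'right'; 'left' would prepend

processed_example = {
    'input_ids': input_ids,
    'labels': labels,
    'attention_mask': attention_mask,  # no 'bidirectional_mask' key anymore
}
```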
15 changes: 0 additions & 15 deletions llmfoundry/data/finetuning/dataloader.py
@@ -557,15 +557,6 @@ def _build_collate_fn(
                         1)],
                     skip_special_tokens=False,
                     clean_up_tokenization_spaces=True))
-            print(
-                '\033[92m{}\033[00m\n'.format('CONTEXT: '),
-                tokenizer.decode(batch['input_ids'][
-                    j,
-                    torch.logical_and(
-                        is_subseq, batch['bidirectional_mask'][j] ==
-                        1)],
-                    skip_special_tokens=False,
-                    clean_up_tokenization_spaces=True))
             print(
                 '\033[91m{}\033[00m\n'.format('TARGET: '),
                 tokenizer.decode(batch['input_ids'][
@@ -583,12 +574,6 @@
                     batch['attention_mask'][j] == 1],
                 skip_special_tokens=False,
                 clean_up_tokenization_spaces=True))
-            print(
-                '\033[92m{}\033[00m\n'.format('CONTEXT: '),
-                tokenizer.decode(batch['input_ids'][
-                    j, batch['bidirectional_mask'][j] == 1],
-                    skip_special_tokens=False,
-                    clean_up_tokenization_spaces=True))
             print(
                 '\033[91m{}\033[00m\n'.format('TARGET: '),
                 tokenizer.decode(batch['input_ids'][
2 changes: 0 additions & 2 deletions llmfoundry/data/packing.py
@@ -71,7 +71,6 @@ def pack(self, batch: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
             'input_ids',
             'labels',
             'attention_mask',
-            'bidirectional_mask',
             'sequence_id',
         ]
         # Cut everything down to size
@@ -278,7 +277,6 @@ def pad_tensor(tensor: torch.Tensor, pad_value: int):
         'input_ids': pad_token_id,
         'labels': -100,
         'attention_mask': 0,
-        'bidirectional_mask': 0,
         'sequence_id': -1,
     }
     keys = packed_examples[0].keys()
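For reference, a sketch of padding a packed example with the per-key pad values that remain (values copied from the diff; `pad_to` is an illustrative helper, not the packing module's actual function):

```python
import torch

pad_token_id = 0  # assumed here; in practice this comes from the tokenizer
PAD_VALUES = {
    'input_ids': pad_token_id,
    'labels': -100,
    'attention_mask': 0,
    'sequence_id': -1,  # 'bidirectional_mask' no longer appears in this mapping
}

def pad_to(t: torch.Tensor, length: int, pad_value: int) -> torch.Tensor:
    """Right-pad a 1-D tensor to `length` with `pad_value` (illustrative)."""
    fill = torch.full((length - t.shape[0],), pad_value, dtype=t.dtype)
    return torch.cat([t, fill])

example = {
    'input_ids': torch.tensor([5, 6, 7]),
    'labels': torch.tensor([-100, 6, 7]),
    'attention_mask': torch.tensor([1, 1, 1]),
    'sequence_id': torch.tensor([0, 0, 0]),
}
padded = {k: pad_to(v, 8, PAD_VALUES[k]) for k, v in example.items()}
```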
6 changes: 0 additions & 6 deletions llmfoundry/metrics/__init__.py
@@ -43,11 +43,6 @@
     'code_eval_accuracy',
 ]

-DEFAULT_PREFIX_LM_METRICS = [
-    'language_cross_entropy',
-    'masked_accuracy',
-]
-
 DEFAULT_ENC_DEC_METRICS = [
     'language_cross_entropy',
     'masked_accuracy',
@@ -66,6 +61,5 @@
     'MaskedAccuracy',
     'DEFAULT_CAUSAL_LM_TRAIN_METRICS',
     'DEFAULT_CAUSAL_LM_EVAL_METRICS',
-    'DEFAULT_PREFIX_LM_METRICS',
     'DEFAULT_ENC_DEC_METRICS',
 ]
5 changes: 1 addition & 4 deletions llmfoundry/models/__init__.py
@@ -1,8 +1,7 @@
 # Copyright 2022 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0

-from llmfoundry.models.hf import (ComposerHFCausalLM, ComposerHFPrefixLM,
-                                  ComposerHFT5)
+from llmfoundry.models.hf import ComposerHFCausalLM, ComposerHFT5
 from llmfoundry.models.inference_api_wrapper import (FMAPICasualLMEvalWrapper,
                                                      FMAPIChatAPIEvalWrapper,
                                                      OpenAICausalLMEvalWrapper,
@@ -13,7 +12,6 @@

 models.register('mpt_causal_lm', func=ComposerMPTCausalLM)
 models.register('hf_causal_lm', func=ComposerHFCausalLM)
-models.register('hf_prefix_lm', func=ComposerHFPrefixLM)
 models.register('hf_t5', func=ComposerHFT5)
 models.register('openai_causal_lm', func=OpenAICausalLMEvalWrapper)
 models.register('fmapi_causal_lm', func=FMAPICasualLMEvalWrapper)
@@ -22,7 +20,6 @@

 __all__ = [
     'ComposerHFCausalLM',
-    'ComposerHFPrefixLM',
     'ComposerHFT5',
     'MPTConfig',
     'MPTPreTrainedModel',
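Any config that used the removed `hf_prefix_lm` entry now has to target one of the names still registered above. A hedged sketch; the config keys are typical llm-foundry fields and are not taken from this diff:

```python
# Some of the model names still registered at this commit (see register() calls above).
STILL_REGISTERED = {'mpt_causal_lm', 'hf_causal_lm', 'hf_t5',
                    'openai_causal_lm', 'fmapi_causal_lm'}

model_cfg = {
    'name': 'hf_causal_lm',                   # was 'hf_prefix_lm' before this commit
    'pretrained_model_name_or_path': 'gpt2',  # illustrative checkpoint
}
assert model_cfg['name'] in STILL_REGISTERED
```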
2 changes: 0 additions & 2 deletions llmfoundry/models/hf/__init__.py
@@ -5,12 +5,10 @@
 from llmfoundry.models.hf.hf_fsdp import (prepare_hf_causal_lm_model_for_fsdp,
                                           prepare_hf_enc_dec_model_for_fsdp,
                                           prepare_hf_model_for_fsdp)
-from llmfoundry.models.hf.hf_prefix_lm import ComposerHFPrefixLM
 from llmfoundry.models.hf.hf_t5 import ComposerHFT5

 __all__ = [
     'ComposerHFCausalLM',
-    'ComposerHFPrefixLM',
     'ComposerHFT5',
     'prepare_hf_causal_lm_model_for_fsdp',
     'prepare_hf_enc_dec_model_for_fsdp',
144 changes: 0 additions & 144 deletions llmfoundry/models/hf/hf_prefix_lm.py

This file was deleted.

13 changes: 1 addition & 12 deletions llmfoundry/models/hf/hf_t5.py
@@ -15,8 +15,7 @@
 from llmfoundry.metrics import DEFAULT_ENC_DEC_METRICS
 from llmfoundry.models.hf.hf_fsdp import hf_get_init_device
 from llmfoundry.models.hf.model_wrapper import HuggingFaceModelWithFSDP
-from llmfoundry.models.utils import (adapt_tokenizer_for_denoising,
-                                     init_empty_weights)
+from llmfoundry.models.utils import init_empty_weights
 from llmfoundry.utils.warnings import experimental_class

 __all__ = ['ComposerHFT5']
@@ -42,12 +41,6 @@ class ComposerHFT5(HuggingFaceModelWithFSDP):
         cfg.init_device ('cpu' | 'meta'): Which device, 'cpu' or 'meta', to
             initialize the model on. Currently, `meta` is only supported when
             cfg.pretrained is ``False``. Default: ``'cpu'``.
-        cfg.adapt_vocab_for_denoising (bool, optional): Whether to adapt the vocab
-            of the model/tokenizer to include sentinel tokens that are used in denoising
-            tasks like Span Corruption. If you intend to load from an existing Composer
-            checkpoint that was trained on such a task, set this to ``True`` to ensure
-            that the model vocab size matches your checkpoint's vocab size when loading
-            the weights. Default: ``False``.
         tokenizer (PreTrainedTokenizer): The tokenizer that the model will use.
     """

@@ -84,10 +77,6 @@ def __init__(self, om_model_config: DictConfig,
            raise ValueError(f'Model type "hf_t5" currently only supports T5 models ' +\
                             f'using configs where `is_encoder_decoder` is ``True``.')

-        # Set up the tokenizer (add tokens for denoising sentinels if needed)
-        if om_model_config.get('adapt_vocab_for_denoising', False):
-            adapt_tokenizer_for_denoising(tokenizer)
-
         init_device = om_model_config.get('init_device', 'cpu')

         # Get the device we want to initialize, and use the
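With `adapt_vocab_for_denoising` removed from both the docstring and the constructor, an `hf_t5` config only needs the remaining fields. A hedged sketch; apart from `init_device` and `pretrained`, which appear in the docstring above, the keys are assumptions about typical llm-foundry configs:

```python
from omegaconf import OmegaConf

model_cfg = OmegaConf.create({
    'name': 'hf_t5',                             # registry name from models/__init__.py
    'pretrained_model_name_or_path': 't5-base',  # assumed key/value, for illustration
    'pretrained': True,
    'init_device': 'cpu',
    # 'adapt_vocab_for_denoising': True  <- no longer recognized after this commit
})
```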
7 changes: 3 additions & 4 deletions llmfoundry/models/layers/attention.py
@@ -645,17 +645,16 @@


 def attn_bias_shape(
-        attn_impl: str, n_heads: int, seq_len: int, alibi: bool,
-        prefix_lm: bool, causal: bool,
+        attn_impl: str, n_heads: int, seq_len: int, alibi: bool, causal: bool,
         use_sequence_id: bool) -> Optional[tuple[int, int, int, int]]:
     if attn_impl == 'flash':
         return None
     elif attn_impl == 'torch':
         if alibi:
-            if (prefix_lm or not causal) or use_sequence_id:
+            if (not causal) or use_sequence_id:
                 return (1, n_heads, seq_len, seq_len)
             return (1, n_heads, 1, seq_len)
-        elif prefix_lm or use_sequence_id:
+        elif use_sequence_id:
             return (1, 1, seq_len, seq_len)
         return None
     else:
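The simplified `attn_bias_shape` now only branches on ALiBi, causality, and sequence-id masking. A quick sketch of its behavior, with return shapes read straight from the branches above:

```python
from llmfoundry.models.layers.attention import attn_bias_shape

n_heads, seq_len = 8, 2048

# Flash attention never materializes an explicit bias tensor.
assert attn_bias_shape('flash', n_heads, seq_len, alibi=True, causal=True,
                       use_sequence_id=False) is None

# Torch attention with ALiBi and a plain causal mask only needs a (1, H, 1, S) bias.
assert attn_bias_shape('torch', n_heads, seq_len, alibi=True, causal=True,
                       use_sequence_id=False) == (1, n_heads, 1, seq_len)

# Sequence-id masking (e.g. packed sequences) still needs the full (1, 1, S, S) bias.
assert attn_bias_shape('torch', n_heads, seq_len, alibi=False, causal=True,
                       use_sequence_id=True) == (1, 1, seq_len, seq_len)
```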