Merge branch 'main' into add_finetuning_example_2
bigning authored Feb 6, 2024
2 parents af1e6c9 + 105f766 commit 9f63c87
Showing 27 changed files with 11,965 additions and 696 deletions.
2 changes: 1 addition & 1 deletion llmfoundry/__init__.py
@@ -95,4 +95,4 @@
'TiktokenTokenizerWrapper',
]

-__version__ = '0.4.0'
+__version__ = '0.5.0'
4 changes: 0 additions & 4 deletions llmfoundry/callbacks/__init__.py
@@ -5,9 +5,7 @@
from llmfoundry.callbacks.async_eval_callback import AsyncEval
from llmfoundry.callbacks.eval_gauntlet_callback import EvalGauntlet
from llmfoundry.callbacks.fdiff_callback import FDiffMetrics
from llmfoundry.callbacks.generate_callback import Generate
from llmfoundry.callbacks.hf_checkpointer import HuggingFaceCheckpointer
from llmfoundry.callbacks.model_gauntlet_callback import ModelGauntlet
from llmfoundry.callbacks.monolithic_ckpt_callback import \
MonolithicCheckpointSaver
from llmfoundry.callbacks.resumption_callbacks import (GlobalLRScaling,
@@ -21,13 +19,11 @@

__all__ = [
'FDiffMetrics',
'Generate',
'MonolithicCheckpointSaver',
'GlobalLRScaling',
'LayerFreezing',
'ScheduledGarbageCollector',
'EvalGauntlet',
'ModelGauntlet',
'HuggingFaceCheckpointer',
'AsyncEval',
]
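With Generate and ModelGauntlet removed from the public exports, downstream code should import only the callbacks that remain in __all__. A minimal sketch of that import surface follows; the HuggingFaceCheckpointer constructor arguments are illustrative assumptions, not values taken from this diff.

```python
# Minimal sketch of the callback import surface after this commit; the
# HuggingFaceCheckpointer arguments below are illustrative assumptions.
from llmfoundry.callbacks import HuggingFaceCheckpointer  # still exported

# These imports now fail -- the modules/exports were removed in this commit:
# from llmfoundry.callbacks import Generate
# from llmfoundry.callbacks import ModelGauntlet

checkpointer = HuggingFaceCheckpointer(
    save_folder='./hf_checkpoints',  # placeholder path (assumed argument name)
    save_interval='1ep',             # assumed argument name
)
```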
30 changes: 0 additions & 30 deletions llmfoundry/callbacks/generate_callback.py

This file was deleted.

21 changes: 0 additions & 21 deletions llmfoundry/callbacks/model_gauntlet_callback.py

This file was deleted.

6 changes: 3 additions & 3 deletions llmfoundry/data/finetuning/tasks.py
@@ -232,12 +232,12 @@ def tokenize_formatted_example(

def is_valid_ift_example(pad_token_id: int, max_seq_len: int,
example: Dict) -> bool:
"""Check if it's an valid ift example.
"""Check if the example is a valid ift example.
This functions does the following check:
a. Length of input_ids should less than max_seq_len
a. Length of input_ids should be less than max_seq_len
b. Both input_ids and labels should not be empty
c. Labels should has at least 1 non-padding token.
c. Labels should have at least 1 non-padding token.
Args:
pad_token_id (int): The id of the padding token.
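For reference, here is a minimal sketch of the three checks the corrected docstring describes. It illustrates the documented rules only; it is not the exact body of is_valid_ift_example, which is truncated in this view.

```python
# Sketch of the validity rules documented above; not the file's implementation.
from typing import Dict


def _sketch_is_valid_ift_example(pad_token_id: int, max_seq_len: int,
                                 example: Dict) -> bool:
    input_ids = example.get('input_ids', [])
    labels = example.get('labels', [])
    return (
        len(input_ids) < max_seq_len and            # (a) fits within max_seq_len
        len(input_ids) > 0 and len(labels) > 0 and  # (b) neither field is empty
        any(token != pad_token_id for token in labels)  # (c) >=1 non-padding label
    )


# Example: an empty labels sequence is rejected.
assert not _sketch_is_valid_ift_example(
    0, 2048, {'input_ids': [1, 2, 3], 'labels': []})
```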
103 changes: 0 additions & 103 deletions llmfoundry/data/packing.py
@@ -2,7 +2,6 @@
# SPDX-License-Identifier: Apache-2.0

import logging
import os
import tempfile
from typing import Callable, Dict, Iterable, List, Literal, Optional, Tuple

@@ -426,105 +425,3 @@ def profile(raw_batch_size: int) -> Tuple[Optional[float], Optional[float]]:
for packing_ratio, raw_batch_size in zip(packing_ratios, raw_batch_sizes):
padding, waste = profile(raw_batch_size)
yield (packing_ratio, padding, waste)


if __name__ == '__main__':

import warnings

warnings.warn(
DeprecationWarning(
'Please use scripts/misc/profile_packing.py to profile packing.' +
'This script will be removed in later releases.'))

import os
from argparse import ArgumentParser, Namespace

from omegaconf import OmegaConf as om

from llmfoundry.utils import build_tokenizer

def parse_args() -> Namespace:
"""Parse commandline arguments."""
parser = ArgumentParser(
description=
'Profile packing_ratio choices for a particular workload.')
parser.add_argument(
'--yaml-path',
type=str,
required=True,
help='Path to the YAML that defines the workload to profile.')
parser.add_argument('--num-devices',
type=int,
default=None,
help='How many devices your run will use.')
parser.add_argument('--min',
type=float,
required=True,
help='Smallest packing_ratio to test. Must be >=1.')
parser.add_argument(
'--max',
type=float,
required=True,
help='Largest packing_ratio to test. Must be larger than `min`.')
parser.add_argument(
'--num-packing-ratios',
type=int,
default=20,
help=
'Number of packing_ratio values (spaced between `min` and `max) to try.'
)

args = parser.parse_args()

if not os.path.isfile(args.yaml_path):
raise FileNotFoundError(
'`yaml_path` does not correspond to any existing file.')
if args.num_devices < 1:
raise ValueError('`num_devices` must be a positive integer.')
if args.min < 1.0:
raise ValueError('`min` must be >=1.0.')
if args.max < args.min:
raise ValueError('`max` cannot be less than `min`.')
if args.num_packing_ratios < 1:
raise ValueError('`num_packing_ratios` must be a positive integer.')
return args

args = parse_args()

with open(args.yaml_path) as f:
cfg = om.load(f)
if 'parameters' in cfg:
cfg = om.to_container(cfg.parameters)
cfg = om.create(cfg)
device_batch_size = cfg.global_train_batch_size // args.num_devices

# Fetch a bunch of raw examples once, which we'll re-use
if 'train_loader' not in cfg:
raise ValueError('config must define train_loader')
dataloader_cfg = cfg.train_loader

# build tokenizer
if 'tokenizer' not in cfg:
raise ValueError('config must define tokenizer')

resolved_tokenizer_cfg = om.to_container(cfg.tokenizer, resolve=True)
if not isinstance(resolved_tokenizer_cfg, Dict):
raise ValueError(
'tokenizer config needs to be resolved by omegaconf into a Dict.')
tokenizer_cfg = resolved_tokenizer_cfg

tokenizer_name = tokenizer_cfg['name']
tokenizer_kwargs = tokenizer_cfg.get('kwargs', {})
tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs)

results = profile_packing(dataloader_cfg, tokenizer, args.min, args.max,
args.num_packing_ratios, device_batch_size)

header = '\n\n\n packing_ratio | % PADDING | % WASTE'
fstr = ' {:5.1f} | {:5.2f}% | {:6.2f}%'

print(header)
print('-' * len(header))
for packing_ratio, padding, waste in results:
print(fstr.format(packing_ratio, padding, waste))
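The deleted __main__ block now lives in scripts/misc/profile_packing.py, but the retained profile_packing generator remains the underlying entry point. Below is a sketch of driving it directly, mirroring the positional call in the removed code; the YAML path, ratio range, and device count are placeholder assumptions.

```python
# Sketch of calling the retained profile_packing generator directly, mirroring
# the removed __main__ block above; YAML path, ratio range, and device count
# are placeholder assumptions.
from omegaconf import OmegaConf as om

from llmfoundry.data.packing import profile_packing
from llmfoundry.utils import build_tokenizer

with open('finetune_workload.yaml') as f:  # placeholder path
    cfg = om.load(f)

tokenizer_cfg = om.to_container(cfg.tokenizer, resolve=True)
tokenizer = build_tokenizer(tokenizer_cfg['name'], tokenizer_cfg.get('kwargs', {}))

device_batch_size = cfg.global_train_batch_size // 8  # assuming 8 devices

# Positional args, as in the removed script: dataloader cfg, tokenizer,
# min packing_ratio, max packing_ratio, number of ratios, device batch size.
for packing_ratio, padding, waste in profile_packing(cfg.train_loader, tokenizer,
                                                     1.0, 8.0, 20,
                                                     device_batch_size):
    print(packing_ratio, padding, waste)  # padding and waste are percentages
```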
13 changes: 0 additions & 13 deletions llmfoundry/data/text_data.py
@@ -108,13 +108,6 @@ def __init__(self,
batching_method: str = 'random',
**kwargs: Any):

group_method = kwargs.pop('group_method', None)
if group_method is not None:
raise NotImplementedError(
'group_method is deprecated and has been removed.\nTo ' +
'concatenate, use the --concat_tokens ' +
'argument when creating your MDS dataset with concat_c4.py')

if len(kwargs) > 0:
raise ValueError(
f'StreamingTextDataset() got an unexpected keyword argument: {kwargs}'
@@ -245,12 +238,6 @@ def build_text_dataloader(
device_batch_size: int,
) -> DataSpec:
assert cfg.name == 'text', f'Tried to build text dataloader with cfg.name={cfg.name}'
if cfg.dataset.get('group_method', None) is not None:
raise NotImplementedError(
'group_method is deprecated and has been removed.\nTo ' +
'concatenate, use the --concat_tokens ' +
'argument when creating your MDS dataset with convert_dataset_hf.py'
)

# get kwargs
streams_dict = cfg.dataset.pop('streams', None)
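With the group_method shim gone, a stray group_method key in a dataset config now surfaces as the generic unexpected-keyword ValueError shown above rather than a tailored deprecation message. A sketch of a minimal text train_loader config without it follows; paths and field names beyond name/dataset are assumptions for illustration.

```python
# Sketch of a minimal `train_loader` config for build_text_dataloader after
# this change; paths and most field names are placeholder assumptions. Token
# concatenation is handled when the MDS dataset is built (--concat_tokens),
# not via a group_method key here.
from omegaconf import OmegaConf as om

train_loader_cfg = om.create({
    'name': 'text',            # asserted by build_text_dataloader
    'dataset': {
        'local': '/tmp/my-text-mds',
        'split': 'train',
        'max_seq_len': 2048,
        'shuffle': True,
        # 'group_method': 'concat',  # no longer recognized; now raises the
        #                            # generic "unexpected keyword" ValueError
    },
    'drop_last': True,
    'num_workers': 8,
})
```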
@@ -246,7 +246,6 @@ class OpenAICausalLMEvalWrapper(OpenAIEvalInterface):

def __init__(self, model_cfg: Dict, tokenizer: AutoTokenizer) -> None:
super().__init__(model_cfg, tokenizer)
# TODO: this will be deprecated
self.generate_completion = lambda prompt, num_tokens: self.client.completions.create(
model=self.model_name,
prompt=prompt,
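The lambda above maps a prompt and token budget onto the OpenAI completions endpoint; its remaining keyword arguments are truncated in this view. Here is a sketch of the same call pattern with the standard client, where max_tokens, temperature, and the model name are assumptions rather than values from this file.

```python
# Sketch of the completions call wrapped by the lambda above; the kwargs after
# `prompt` are cut off in this diff, so max_tokens/temperature are assumptions
# based on the standard OpenAI client, and the model name is a placeholder.
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment


def generate_completion(prompt: str, num_tokens: int,
                        model_name: str = 'davinci-002'):
    return client.completions.create(
        model=model_name,
        prompt=prompt,
        max_tokens=num_tokens,
        temperature=0.0,
    )
```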
46 changes: 3 additions & 43 deletions llmfoundry/models/layers/attention.py
@@ -90,7 +90,7 @@ def scaled_multihead_dot_product_attention(
key: torch.Tensor,
value: torch.Tensor,
n_heads: int,
-kv_n_heads: Optional[int] = None,
+kv_n_heads: int,
past_key_value: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
softmax_scale: Optional[float] = None,
attn_bias: Optional[torch.Tensor] = None,
@@ -99,21 +99,8 @@
dropout_p: float = 0.0,
training: bool = False,
needs_weights: bool = False,
multiquery: bool = False,
) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor,
torch.Tensor]]]:
if multiquery:
warnings.warn(
DeprecationWarning(
'The direct use of the multiquery arg is deprecated. Setting kv_n_heads=1 automatically. Please set kv_n_heads=1 explicitly to remove this warning.'
))
kv_n_heads = 1
elif kv_n_heads is None:
warnings.warn(
DeprecationWarning(
'Not specifying a value for the kv_n_heads arg is deprecated. Setting kv_n_heads=n_heads automatically. Please set kv_n_heads=n_heads explicitly to remove this warning.'
))
kv_n_heads = n_heads

q = rearrange(query, 'b s (h d) -> b h s d', h=n_heads)
k = rearrange(key, 'b s (h d) -> b h d s', h=kv_n_heads)
@@ -218,7 +205,7 @@ def flash_attn_fn(
key: torch.Tensor,
value: torch.Tensor,
n_heads: int,
-kv_n_heads: Optional[int] = None,
+kv_n_heads: int,
past_key_value: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
softmax_scale: Optional[float] = None,
attn_bias: Optional[torch.Tensor] = None,
@@ -247,19 +234,6 @@

check_valid_inputs(query, key, value)

if multiquery:
warnings.warn(
DeprecationWarning(
'The direct use of the multiquery arg is deprecated. Setting kv_n_heads=1 automatically. Please set kv_n_heads=1 explicitly to remove this warning.'
))
kv_n_heads = 1
elif kv_n_heads is None:
warnings.warn(
DeprecationWarning(
'Not specifying a value for the kv_n_heads arg is deprecated. Setting kv_n_heads=n_heads automatically. Please set kv_n_heads=n_heads explicitly to remove this warning.'
))
kv_n_heads = n_heads

if past_key_value is not None:
if len(past_key_value) != 0:
key = torch.cat([past_key_value[0], key], dim=1)
@@ -379,7 +353,7 @@ def triton_flash_attn_fn(
key: torch.Tensor,
value: torch.Tensor,
n_heads: int,
-kv_n_heads: Optional[int] = None,
+kv_n_heads: int,
past_key_value: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
softmax_scale: Optional[float] = None,
attn_bias: Optional[torch.Tensor] = None,
@@ -388,7 +362,6 @@
dropout_p: float = 0.0,
training: bool = False,
needs_weights: bool = False,
multiquery: bool = False,
) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor,
torch.Tensor]]]:
try:
@@ -420,19 +393,6 @@ def triton_flash_attn_fn(

check_valid_inputs(query, key, value)

if multiquery:
warnings.warn(
DeprecationWarning(
'The direct use of the multiquery arg is deprecated. Setting kv_n_heads=1 automatically. Please set kv_n_heads=1 explicitly to remove this warning.'
))
kv_n_heads = 1
elif kv_n_heads is None:
warnings.warn(
DeprecationWarning(
'Not specifying a value for the kv_n_heads arg is deprecated. Setting kv_n_heads=n_heads automatically. Please set kv_n_heads=n_heads explicitly to remove this warning.'
))
kv_n_heads = n_heads

if past_key_value is not None:
if len(past_key_value) != 0:
key = torch.cat([past_key_value[0], key], dim=1)
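After this change, all three attention functions require kv_n_heads to be passed explicitly (the multiquery flag and the implicit default are gone). A minimal sketch of a call with the new signature follows; shapes follow the rearrange calls shown above, and the kv_n_heads value here is just an example.

```python
# Minimal sketch of calling scaled_multihead_dot_product_attention with the
# now-required kv_n_heads argument. Per the rearrange calls above, query is
# (batch, seq, n_heads * head_dim) and key/value are
# (batch, seq, kv_n_heads * head_dim).
import torch

from llmfoundry.models.layers.attention import \
    scaled_multihead_dot_product_attention

batch, seq, n_heads, head_dim = 2, 16, 8, 64
kv_n_heads = 1  # 1 = multi-query; kv_n_heads = n_heads recovers standard MHA

query = torch.randn(batch, seq, n_heads * head_dim)
key = torch.randn(batch, seq, kv_n_heads * head_dim)
value = torch.randn(batch, seq, kv_n_heads * head_dim)

context, attn_weights, past_key_value = scaled_multihead_dot_product_attention(
    query, key, value, n_heads=n_heads, kv_n_heads=kv_n_heads)
```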
4 changes: 0 additions & 4 deletions llmfoundry/models/layers/llama_attention_monkeypatch.py
@@ -78,8 +78,6 @@ def llama_attention_patch_torch(
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: bool = False,
use_cache: bool = False,
# Temporary fix for llama2 transformers compatibility, padding_mask will be deprecated in the next transformers release after 4.34.1.
padding_mask: Optional[torch.LongTensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
if use_cache:
raise NotImplementedError(
@@ -188,8 +186,6 @@ def llama_attention_patch_triton(
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: bool = False,
use_cache: bool = False,
# Temporary fix for llama2 transformers compatibility, padding_mask will be deprecated in the next transformers release after 4.34.1.
padding_mask: Optional[torch.LongTensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
if use_cache:
raise NotImplementedError(
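The two patch functions above replace LlamaAttention.forward with torch- or triton-backed attention. A sketch of applying one directly follows; llm-foundry normally wires this up through its model config, so treat the direct assignment as an illustrative assumption rather than the library's own API.

```python
# Sketch of applying the torch patch by monkeypatching LlamaAttention.forward;
# llm-foundry normally selects and applies the patch via its own config
# plumbing, so this direct assignment is an illustrative assumption.
import transformers.models.llama.modeling_llama as llama_modeling

from llmfoundry.models.layers.llama_attention_monkeypatch import \
    llama_attention_patch_torch

# Every LlamaAttention instance now routes through the patched forward, which
# (after this commit) no longer accepts the temporary padding_mask argument.
llama_modeling.LlamaAttention.forward = llama_attention_patch_torch
```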