From 0e84450ff26dcdf197782cc3f8a1107956c528eb Mon Sep 17 00:00:00 2001 From: Linden Li Date: Tue, 24 Oct 2023 03:53:13 +0000 Subject: [PATCH 01/49] fix attention --- llmfoundry/models/layers/attention.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/llmfoundry/models/layers/attention.py b/llmfoundry/models/layers/attention.py index c8d578cb2d..596d6e7004 100644 --- a/llmfoundry/models/layers/attention.py +++ b/llmfoundry/models/layers/attention.py @@ -675,8 +675,6 @@ def __init__( kv_n_heads=n_heads, # for MHA, same # heads as kv groups attn_impl=attn_impl, clip_qkv=clip_qkv, - tensor_parallel_qkvo=tensor_parallel_qkvo, - tp_world_size=tp_world_size, qk_ln=qk_ln, tensor_parallel_qkvo=tensor_parallel_qkvo, tp_world_size=tp_world_size, From d72902aea49260bbeb82e70f9c7412d6d585522c Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Tue, 24 Oct 2023 11:25:35 -0700 Subject: [PATCH 02/49] Allow flash attention 2 and upgrade to transformers 4.34.1 (#672) * more special casing in tokenizer equivalence check * fix addedtoken -> str * add lazy load option * add gc collect * updates for the patch release * add documentation for flash attention options --- llmfoundry/__init__.py | 11 +- llmfoundry/models/hf/hf_causal_lm.py | 22 ++- llmfoundry/tokenizers/tiktoken.py | 4 +- scripts/train/README.md | 62 ++++++++- scripts/train/train.py | 8 ++ setup.py | 9 +- tests/test_hf_conversion_script.py | 43 ++++++ tests/test_huggingface_flash.py | 195 +++++++++++++++++++++++++++ tests/test_llama_patch.py | 95 ------------- 9 files changed, 337 insertions(+), 112 deletions(-) create mode 100644 tests/test_huggingface_flash.py delete mode 100644 tests/test_llama_patch.py diff --git a/llmfoundry/__init__.py b/llmfoundry/__init__.py index 3bb9eed043..51fa67993a 100644 --- a/llmfoundry/__init__.py +++ b/llmfoundry/__init__.py @@ -4,6 +4,11 @@ import torch try: + # Before importing any transformers models, we need to disable transformers flash attention if + # we are in an environment with flash attention version <2. Transformers hard errors on a not properly + # gated import otherwise. 
+ import transformers + from llmfoundry import optim, utils from llmfoundry.data import (ConcatTokensDataset, MixtureOfDenoisersCollator, NoConcatDataset, @@ -14,8 +19,8 @@ ComposerHFT5) from llmfoundry.models.layers.attention import ( MultiheadAttention, attn_bias_shape, build_alibi_bias, build_attn_bias, - flash_attn_fn, scaled_multihead_dot_product_attention, - triton_flash_attn_fn) + flash_attn_fn, is_flash_v1_installed, + scaled_multihead_dot_product_attention, triton_flash_attn_fn) from llmfoundry.models.layers.blocks import MPTBlock from llmfoundry.models.layers.ffn import (FFN_CLASS_REGISTRY, MPTMLP, build_ffn) @@ -24,6 +29,8 @@ MPTForCausalLM, MPTModel, MPTPreTrainedModel) from llmfoundry.tokenizers import TiktokenTokenizerWrapper + if is_flash_v1_installed(): + transformers.utils.is_flash_attn_available = lambda: False except ImportError as e: try: diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py index 13857e9bb9..eb90b07045 100644 --- a/llmfoundry/models/hf/hf_causal_lm.py +++ b/llmfoundry/models/hf/hf_causal_lm.py @@ -24,8 +24,7 @@ from llmfoundry.models.hf.hf_fsdp import hf_get_init_device from llmfoundry.models.hf.model_wrapper import HuggingFaceModelWithZLoss -from llmfoundry.models.layers.llama_attention_monkeypatch import \ - get_llama_attention_patch_fn +from llmfoundry.models.layers.attention import is_flash_v2_installed from llmfoundry.models.utils import init_empty_weights try: @@ -95,12 +94,28 @@ def __init__(self, om_model_config: Union[DictConfig, # load the model config trust_remote_code = om_model_config.get('trust_remote_code', True) use_auth_token = om_model_config.get('use_auth_token', False) + use_flash_attention_2 = om_model_config.get('use_flash_attention_2', + False) + if use_flash_attention_2 and not is_flash_v2_installed(): + raise ValueError( + 'use_flash_attention_2 is set to True, but flash-attention 2 is not installed. ' + + 'Please install flash_attn==2.3.2`.') + config = AutoConfig.from_pretrained( om_model_config.pretrained_model_name_or_path, trust_remote_code=trust_remote_code, use_auth_token=use_auth_token, ) + # This is not how you are supposed to set this, but transformers currently only + # supports enabling flash attention 2 when using the from_pretrained API. + # We need to support it for both from_pretrained and from_config, so we have to + # set the private attribute here. This will just skip all of transformers' + # validation logic that it is ok to use flash attention 2, so we check + # whether it is installed above, and whether the chosen config supports it here. 
+ # https://github.com/huggingface/transformers/issues/26878 + config._flash_attn_2_enabled = use_flash_attention_2 + # set config overrides for k, v in om_model_config.get('config_overrides', {}).items(): if not hasattr(config, k): @@ -200,6 +215,9 @@ def __init__(self, om_model_config: Union[DictConfig, ) from transformers.models.llama.modeling_llama import \ LlamaAttention + + from llmfoundry.models.layers.llama_attention_monkeypatch import \ + get_llama_attention_patch_fn LlamaAttention.forward = get_llama_attention_patch_fn( attention_patch_type) model.config.use_cache = False diff --git a/llmfoundry/tokenizers/tiktoken.py b/llmfoundry/tokenizers/tiktoken.py index 41518a582a..45192e09dd 100644 --- a/llmfoundry/tokenizers/tiktoken.py +++ b/llmfoundry/tokenizers/tiktoken.py @@ -155,7 +155,7 @@ def convert_ids_to_tokens( """ if isinstance(ids, int): if ids in self.added_tokens_decoder: - return self.added_tokens_decoder[ids] + return str(self.added_tokens_decoder[ids]) return self._convert_id_to_token(ids) @@ -171,7 +171,7 @@ def convert_ids_to_tokens( if index in self.added_tokens_decoder: tokens.append(self.encoding.decode(current_stream)) current_stream = [] - tokens.append(self.added_tokens_decoder[index]) + tokens.append(str(self.added_tokens_decoder[index])) else: current_stream.append(index) diff --git a/scripts/train/README.md b/scripts/train/README.md index f10fdf59f0..4c706dc040 100644 --- a/scripts/train/README.md +++ b/scripts/train/README.md @@ -5,14 +5,15 @@ This README walks through pretraining and finetuning a large language model usin #### Table of Contents 1. [Part 1: LLM Pretraining](#llmpretraining) 1. [Installation](#installation) - 2. [Dataset Preparation](#datasetpreparation) - 3. [How to start single and multi-node pretraining](#howtostartpretraining) -2. [Part 2: LLM Finetuning](#llmfinetuning) + 1. [Dataset Preparation](#datasetpreparation) + 1. [How to start single and multi-node pretraining](#howtostartpretraining) +1. [Part 2: LLM Finetuning](#llmfinetuning) 1. [Using a dataset on the HuggingFace Hub](#hfdataset) - 2. [Using a local dataset](#localdataset) - 3. [Using a StreamingDataset (MDS) formatted dataset locally or in an object store](#mdsdataset) -3. [FAQ: How many GPUs do I need to train a LLM?](#howmandygpus) -4. [FAQ: Optimizing Performance](#optimizingperformance) + 1. [Using a local dataset](#localdataset) + 1. [Using a StreamingDataset (MDS) formatted dataset locally or in an object store](#mdsdataset) +1. [Using Flash Attention](#flashattention) +1. [FAQ: How many GPUs do I need to train a LLM?](#howmandygpus) +1. [FAQ: Optimizing Performance](#optimizingperformance) # Part 1: LLM Pretraining @@ -332,6 +333,53 @@ train_loader: ... ``` +# Using Flash Attention + +Flash Attention is an optimized implementation of the attention mechanism, first introduced by [Dao et al.](https://github.com/Dao-AILab/flash-attention). There are three versions of Flash Attention that can be used with LLM Foundry: Flash Attention V1, Flash Attention V2, and a Triton implementation of Flash Attention. To start, we recommend using one of our [provided Docker images](../../README.md#mosaicml-docker-images) corresponding to the Flash Attention version you would like to use. The Triton implementation can be used with either Flash Attention V1 or V2. Next, how you specify to use Flash Attention depends on which model you are using. + +For MPT, you can specify Flash Attention in your YAML like so: +```yaml +model: + name: mpt_causal_lm + ... 
+ attn_config: + # Will use either V1 or V2 depending on what is installed + # "triton" will use the Triton implementation + attn_impl: flash + ... +``` + +If loading MPT from the HuggingFace Hub, you can specify Flash Attention in your YAML like so: +```yaml +model: + name: hf_causal_lm + pretrained_model_name_or_path: mosaicml/mpt-7b + ... + config_overrides: + # Will use either V1 or V2 depending on what is installed + # "triton" will use the Triton implementation + attn_config: + attn_impl: flash + ... +``` + +For any HuggingFace model that supports Flash Attention (e.g. Llama and Mistral), you can specify Flash Attention in your YAML like so: +```yaml +model: + name: hf_causal_lm + use_flash_attention_2: True # Will be automatically set to True if Flash Attention V2 is installed and the model supports it + ... +``` +HuggingFace models currently only support Flash Attention V2. + +For Llama specifically, we have another option if you would like to use the Triton implementation of Flash Attention. You can specify this in your YAML like so: +```yaml +model: + name: hf_causal_lm + pretrained_model_name_or_path: meta-llama/Llama-2-7b-hf + attention_patch_type: triton + ... +``` # FAQ: How many GPUs do I need to train a LLM? This is a complicated question in general, but if we assume that you are using FSDP with `FULL_SHARD`, diff --git a/scripts/train/train.py b/scripts/train/train.py index 28ecb68e34..8c1c28eb5c 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -1,6 +1,7 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 import copy +import gc import logging import os import sys @@ -216,6 +217,12 @@ def main(cfg: DictConfig) -> Trainer: os.environ[ 'PYTORCH_CUDA_ALLOC_CONF'] = f'max_split_size_mb:{max_split_size_mb}' + # Set CUDA lazy loading + # This can save a bit of memory if not all modules are needed + cuda_load_lazy: bool = cfg.pop('cuda_load_lazy', True) + if cuda_load_lazy: + os.environ['CUDA_MODULE_LOADING'] = 'LAZY' + # Set seed first seed: int = pop_config(cfg, 'seed', must_exist=True) reproducibility.seed_all(seed) @@ -634,6 +641,7 @@ def main(cfg: DictConfig) -> Trainer: print('Logging config') log_config(logged_cfg) torch.cuda.empty_cache() + gc.collect() # Eval first if requested if eval_first and trainer.state.timestamp.batch.value == 0: diff --git a/setup.py b/setup.py index d0ecc66160..63aac9d752 100644 --- a/setup.py +++ b/setup.py @@ -49,7 +49,7 @@ install_requires = [ 'mosaicml[libcloud,wandb,mlflow,oci,gcs]>=0.16.4,<0.17', 'accelerate>=0.20,<0.21', # for HF inference `device_map` - 'transformers>=4.33,<4.34', + 'transformers>=4.34.1,<4.35', 'mosaicml-streaming>=0.6,<0.7', 'torch>=1.13.1,<2.1.1', 'datasets>=2.14.5,<2.15', @@ -114,9 +114,10 @@ extra_deps['all-cpu'] = set( dep for key, deps in extra_deps.items() for dep in deps if 'gpu' not in key) extra_deps['all'] = set(dep for key, deps in extra_deps.items() for dep in deps - if key != 'gpu-flash2') -extra_deps['all-flash2'] = set( - dep for key, deps in extra_deps.items() for dep in deps if key != 'gpu') + if key not in {'gpu-flash2', 'all-cpu'}) +extra_deps['all-flash2'] = set(dep for key, deps in extra_deps.items() + for dep in deps + if key not in {'gpu', 'all', 'all-cpu'}) setup( name=_PACKAGE_NAME, diff --git a/tests/test_hf_conversion_script.py b/tests/test_hf_conversion_script.py index e7787754de..fcb2cc3a7e 100644 --- a/tests/test_hf_conversion_script.py +++ b/tests/test_hf_conversion_script.py @@ -138,6 +138,49 @@ def 
check_hf_tokenizer_equivalence(tokenizer1: PreTrainedTokenizerBase, tokenizer1.__dict__['init_kwargs'].pop('auto_map', None) tokenizer2.__dict__['init_kwargs'].pop('auto_map', None) + # Additional special tokens do not match between original tokenizer and loaded tokenizer due to transformers + # constructor differences + additional_special_tokens_1 = { + t if isinstance(t, str) else t.content + for t in tokenizer1.__dict__.pop('_additional_special_tokens', []) + } + additional_special_tokens_2 = { + t if isinstance(t, str) else t.content + for t in tokenizer2.__dict__.pop('_additional_special_tokens', []) + } + # Also pop it out of init_kwargs + tokenizer1.__dict__['init_kwargs'].pop('additional_special_tokens', None) + tokenizer2.__dict__['init_kwargs'].pop('additional_special_tokens', None) + tokenizer1.__dict__['init_kwargs'].pop('added_tokens_decoder', None) + tokenizer2.__dict__['init_kwargs'].pop('added_tokens_decoder', None) + # If the additional special tokens are the same (or a subset of each other), or if one of them is empty, then we are good + assert additional_special_tokens_1.issubset( + additional_special_tokens_2) or additional_special_tokens_2.issubset( + additional_special_tokens_1) + + # The special token attributes may be strings or they may be AddedToken objects, so we just check string values + # First check that they have the same attrs + assert tokenizer1.SPECIAL_TOKENS_ATTRIBUTES == tokenizer2.SPECIAL_TOKENS_ATTRIBUTES + # Then check that the values are the same + for special_token_attr in tokenizer1.SPECIAL_TOKENS_ATTRIBUTES: + # Skip additional_special_tokens because we already checked it above + if special_token_attr == 'additional_special_tokens': + continue + + # The init_kwargs can change between the original tokenizer and the loaded tokenizer, + # so we just pop them + tokenizer1.__dict__['init_kwargs'].pop(special_token_attr, None) + tokenizer2.__dict__['init_kwargs'].pop(special_token_attr, None) + + attr1 = tokenizer1.__dict__.pop('_' + special_token_attr, None) + attr2 = tokenizer2.__dict__.pop('_' + special_token_attr, None) + if attr1 is None and attr2 is None: + continue + + attr_value1 = attr1 if isinstance(attr1, str) else attr1.content + attr_value2 = attr2 if isinstance(attr2, str) else attr2.content + assert attr_value1 == attr_value2 + assert tokenizer1.__dict__ == tokenizer2.__dict__ diff --git a/tests/test_huggingface_flash.py b/tests/test_huggingface_flash.py new file mode 100644 index 0000000000..a71217ea1f --- /dev/null +++ b/tests/test_huggingface_flash.py @@ -0,0 +1,195 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +import contextlib +import os +from unittest.mock import patch + +import pytest +import torch +import transformers +from composer.core.precision import get_precision_context +from composer.utils import reproducibility +from omegaconf import OmegaConf as om + +from llmfoundry import COMPOSER_MODEL_REGISTRY +from llmfoundry.models.hf.hf_fsdp import rgetattr +from llmfoundry.models.layers.attention import (is_flash_v1_installed, + is_flash_v2_installed) +from llmfoundry.utils.builders import build_tokenizer + +# Before importing any transformers models, we need to disable transformers flash attention if +# we are in an environment with flash attention version <2. Transformers hard errors on a not properly +# gated import otherwise. 
+if is_flash_v1_installed(): + transformers.utils.is_flash_attn_available = lambda: False + +from transformers.models.llama.modeling_llama import LlamaAttention + +from llmfoundry.models.layers.llama_attention_monkeypatch import ( + llama_attention_patch_torch, llama_attention_patch_triton) + + +@pytest.mark.parametrize('patch_fn_name', ['torch', 'triton']) +@pytest.mark.parametrize('explicit_mask', [True, False]) +@pytest.mark.parametrize( + 'model_name', ['meta-llama/Llama-2-7b-hf', 'meta-llama/Llama-2-70b-hf']) +@pytest.mark.gpu +def test_patch_equivalence(patch_fn_name: str, explicit_mask: bool, + model_name: str): + if 'HUGGING_FACE_HUB_TOKEN' not in os.environ: + pytest.skip( + 'The CI cluster does not have access to the Llama models, so skip this test.' + ) + + device = 'cuda:0' + sequence_length = 4096 + model_dim = 4096 if '7b' in model_name else 8192 + batch_size = 2 + if patch_fn_name == 'torch': + patch_fn = llama_attention_patch_torch + dtype = torch.float32 + atol = 0.0 + rtol = 0.0 + elif patch_fn_name == 'triton': + # the huggingface implementation of llama performs the softmax in fp32 + # this can result in fairly large differences for the triton implementation + # but the torch implementation produces the exact same output so we can confirm + # the implementation is correct + patch_fn = llama_attention_patch_triton + dtype = torch.bfloat16 + atol = 1e-2 + rtol = 1e-2 + else: + raise ValueError(f'Unknown patch_fn_name: {patch_fn_name}') + + llama_config = transformers.AutoConfig.from_pretrained(model_name, + use_auth_token=True) + + reproducibility.seed_all(42) + attention = LlamaAttention(config=llama_config,) + attention.to(dtype=dtype, device=device) + + rng = torch.Generator(device=device).manual_seed(42) + hidden_states = torch.randn(batch_size, + sequence_length, + model_dim, + generator=rng, + dtype=dtype, + device=device) + causal_mask = torch.full((sequence_length, sequence_length), + torch.finfo(torch.float32).min, + device=device) + causal_mask = causal_mask.triu(diagonal=1) + causal_mask = causal_mask[None, + None, :, :].expand(batch_size, 1, sequence_length, + sequence_length) + attn_output, _, _ = attention( + hidden_states=hidden_states, + attention_mask=causal_mask if explicit_mask else None, + position_ids=None, + past_key_value=None, + use_cache=False, + ) + + reproducibility.seed_all(42) + with patch.object(LlamaAttention, 'forward', new=patch_fn): + attention = LlamaAttention(config=llama_config,) + attention.to(dtype=dtype, device=device) + new_output, _, _ = attention( + hidden_states=hidden_states, + attention_mask=causal_mask if explicit_mask else None, + position_ids=None, + past_key_value=None, + use_cache=False, + ) + + assert torch.allclose(attn_output, new_output, atol=atol, rtol=rtol) + + +@pytest.mark.gpu +@pytest.mark.parametrize('model_name', ['llama2', 'mistral']) +@pytest.mark.parametrize('use_flash_attention_2', [True, False]) +def test_flash2(model_name: str, use_flash_attention_2: bool): + if model_name == 'llama2': + if 'HUGGING_FACE_HUB_TOKEN' not in os.environ: + pytest.skip( + 'The CI cluster does not have access to the Llama models, so skip this test.' 
+ ) + model_cfg = { + 'name': 'hf_causal_lm', + 'pretrained_model_name_or_path': 'meta-llama/Llama-2-7b-hf', + 'config_overrides': { + 'num_hidden_layers': 2, + 'intermediate_size': 64, + }, + 'use_auth_token': True, + 'pretrained': False, + 'init_device': 'cpu', + } + + tokenizer_name = 'meta-llama/Llama-2-7b-hf' + from transformers.models.llama.modeling_llama import ( + LlamaAttention, LlamaFlashAttention2) + flash_attn_class = LlamaFlashAttention2 if use_flash_attention_2 else LlamaAttention + attention_layers_attr = 'model.model.layers' + attention_attr = 'self_attn' + elif model_name == 'mistral': + model_cfg = { + 'name': 'hf_causal_lm', + 'pretrained_model_name_or_path': 'mistralai/Mistral-7B-v0.1', + 'config_overrides': { + 'num_hidden_layers': 2, + 'intermediate_size': 64, + }, + 'pretrained': False, + 'init_device': 'cpu', + } + + tokenizer_name = 'mistralai/Mistral-7B-v0.1' + from transformers.models.mistral.modeling_mistral import ( + MistralAttention, MistralFlashAttention2) + flash_attn_class = MistralFlashAttention2 if use_flash_attention_2 else MistralAttention + attention_layers_attr = 'model.model.layers' + attention_attr = 'self_attn' + else: + raise ValueError(f'Unknown model: {model_name}') + + if use_flash_attention_2: + model_cfg['use_flash_attention_2'] = True + + model_cfg = om.create(model_cfg) + + tokenizer = build_tokenizer( + tokenizer_name=tokenizer_name, + tokenizer_kwargs={'model_max_length': 10}, + ) + tokenizer.pad_token = tokenizer.eos_token + + error_context = pytest.raises( + ValueError, match='use_flash_attention_2 is set to True' + ) if not is_flash_v2_installed( + ) and use_flash_attention_2 else contextlib.nullcontext() + + with error_context: + model = COMPOSER_MODEL_REGISTRY[model_cfg['name']](model_cfg, tokenizer) + + # check that it actually used flash attention 2 + assert model.model.config._flash_attn_2_enabled if use_flash_attention_2 else not model.model.config._flash_attn_2_enabled + attention_layer = rgetattr( + rgetattr(model, attention_layers_attr)[0], attention_attr) + assert isinstance(attention_layer, flash_attn_class) + + tokenized_input = tokenizer(['Hello world blah blah', 'Goodbye world'], + return_tensors='pt', + padding=True) + tokenized_input['labels'] = tokenized_input['input_ids'].clone() + + tokenized_input = {k: v.cuda() for k, v in tokenized_input.items()} + model.to('cuda') + + with get_precision_context('amp_bf16'): + # We're just testing that flash attention 2 runs okay + outputs = model(tokenized_input) + loss = outputs.loss + loss.backward() diff --git a/tests/test_llama_patch.py b/tests/test_llama_patch.py deleted file mode 100644 index b1cd3711e0..0000000000 --- a/tests/test_llama_patch.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright 2022 MosaicML LLM Foundry authors -# SPDX-License-Identifier: Apache-2.0 - -import os - -import pytest -import torch -import transformers -from composer.utils import reproducibility -from transformers.models.llama.modeling_llama import LlamaAttention - -from llmfoundry.models.layers.llama_attention_monkeypatch import ( - llama_attention_patch_torch, llama_attention_patch_triton) - - -@pytest.mark.parametrize('patch_fn_name', ['torch', 'triton']) -@pytest.mark.parametrize('explicit_mask', [True, False]) -@pytest.mark.parametrize( - 'model_name', ['meta-llama/Llama-2-7b-hf', 'meta-llama/Llama-2-70b-hf']) -@pytest.mark.gpu -def test_patch_equivalence(patch_fn_name: str, explicit_mask: bool, - model_name: str): - if 'HUGGING_FACE_HUB_TOKEN' not in os.environ: - pytest.skip( - 'The CI cluster 
does not have access to the Llama models, so skip this test.' - ) - - original_forward = LlamaAttention.forward - - device = 'cuda:0' - sequence_length = 4096 - model_dim = 4096 if '7b' in model_name else 8192 - batch_size = 2 - if patch_fn_name == 'torch': - patch_fn = llama_attention_patch_torch - dtype = torch.float32 - atol = 0.0 - rtol = 0.0 - elif patch_fn_name == 'triton': - # the huggingface implementation of llama performs the softmax in fp32 - # this can result in fairly large differences for the triton implementation - # but the torch implementation produces the exact same output so we can confirm - # the implementation is correct - patch_fn = llama_attention_patch_triton - dtype = torch.bfloat16 - atol = 1e-2 - rtol = 1e-2 - else: - raise ValueError(f'Unknown patch_fn_name: {patch_fn_name}') - - llama_config = transformers.AutoConfig.from_pretrained(model_name, - use_auth_token=True) - - reproducibility.seed_all(42) - attention = LlamaAttention(config=llama_config,) - attention.to(dtype=dtype, device=device) - - rng = torch.Generator(device=device).manual_seed(42) - hidden_states = torch.randn(batch_size, - sequence_length, - model_dim, - generator=rng, - dtype=dtype, - device=device) - causal_mask = torch.full((sequence_length, sequence_length), - torch.finfo(torch.float32).min, - device=device) - causal_mask = causal_mask.triu(diagonal=1) - causal_mask = causal_mask[None, - None, :, :].expand(batch_size, 1, sequence_length, - sequence_length) - attn_output, _, _ = attention( - hidden_states=hidden_states, - attention_mask=causal_mask if explicit_mask else None, - position_ids=None, - past_key_value=None, - use_cache=False, - ) - - reproducibility.seed_all(42) - LlamaAttention.forward = patch_fn - attention = LlamaAttention(config=llama_config,) - attention.to(dtype=dtype, device=device) - new_output, _, _ = attention( - hidden_states=hidden_states, - attention_mask=causal_mask if explicit_mask else None, - position_ids=None, - past_key_value=None, - use_cache=False, - ) - - # Reset the forward function so patches don't persist - LlamaAttention.forward = original_forward - - assert torch.allclose(attn_output, new_output, atol=atol, rtol=rtol) From bc687b7599e7a54a2c259fb589bdc73d7335fc34 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Wed, 25 Oct 2023 11:45:17 -0700 Subject: [PATCH 03/49] Fix mlflow model logging bug (#692) --- llmfoundry/callbacks/hf_checkpointer.py | 29 ++++++++++++++++++++++--- tests/test_hf_conversion_script.py | 29 ++++++++++++++----------- 2 files changed, 42 insertions(+), 16 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index aa3beda513..3050529a5a 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -4,13 +4,14 @@ import contextlib import copy import logging +import math import os import tempfile from pathlib import Path from typing import Optional, Union import torch -from composer.core import Callback, Event, State, Time +from composer.core import Callback, Event, State, Time, TimeUnit from composer.core.state import fsdp_state_dict_type_context from composer.loggers import Logger, MLFlowLogger from composer.loggers.remote_uploader_downloader import RemoteUploaderDownloader @@ -83,6 +84,13 @@ def __init__( self.huggingface_folder_name_fstr = os.path.join( 'huggingface', huggingface_folder_name) + + if isinstance(save_interval, str): + save_interval = Time.from_timestring(save_interval) + if 
isinstance(save_interval, int): + save_interval = Time(save_interval, TimeUnit.EPOCH) + + self.save_interval = save_interval self.check_interval = create_interval_scheduler( save_interval, include_end_of_training=True) self.upload_to_object_store = (self.backend != '') @@ -128,6 +136,21 @@ def run_event(self, event: Event, state: State, logger: Logger) -> None: mlflow.environment_variables.MLFLOW_HUGGINGFACE_MODEL_MAX_SHARD_SIZE.set( '5GB') + def _is_last_batch(self, state: State): + elapsed_duration = state.get_elapsed_duration() + if elapsed_duration is not None and elapsed_duration >= 1.0: + return True + + assert state.max_duration is not None # for pyright + # If the save interval is specified as 1dur, and the max duration is in epoch units + # we need a special case to identify we are on the last batch and should write the mlflow checkpoint + if self.save_interval.unit == TimeUnit.DURATION and self.save_interval.value == 1 and state.max_duration.unit == TimeUnit.EPOCH: + assert state.dataloader_len is not None # for pyright + return int(state.timestamp.batch) % math.ceil( + state.max_duration.value * state.dataloader_len) == 0 + + return False + def _save_checkpoint(self, state: State, logger: Logger): del logger # unused @@ -224,8 +247,8 @@ def _save_checkpoint(self, state: State, logger: Logger): overwrite=self.overwrite, ) - elapsed_duration = state.get_elapsed_duration() - if self.mlflow_registered_model_name is not None and elapsed_duration is not None and elapsed_duration >= 1.0: + if self.mlflow_registered_model_name and self._is_last_batch( + state): components = {'model': new_model_instance} if original_tokenizer is not None: components['tokenizer'] = original_tokenizer diff --git a/tests/test_hf_conversion_script.py b/tests/test_hf_conversion_script.py index fcb2cc3a7e..d2f203d3a0 100644 --- a/tests/test_hf_conversion_script.py +++ b/tests/test_hf_conversion_script.py @@ -251,25 +251,30 @@ def test_callback_inits_with_defaults(): @pytest.mark.parametrize('model', ['mpt', 'neo', 'llama2']) @pytest.mark.parametrize('fsdp_state_dict_type', ['full', 'sharded', None]) @pytest.mark.parametrize('log_to_mlflow', [True, False]) +@pytest.mark.parametrize( + 'hf_save_interval,save_interval,max_duration,expected_hf_checkpoints,expected_normal_checkpoints', + [('3ba', '2ba', '7ba', 3, 4), ('1dur', '2ba', '1ep', 1, 4)]) def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path, fsdp_state_dict_type: Optional[str], - log_to_mlflow: bool): + log_to_mlflow: bool, + hf_save_interval: str, + save_interval: str, max_duration: str, + expected_hf_checkpoints: int, + expected_normal_checkpoints: int): delete_transformers_cache() dist.initialize_dist(get_device('gpu')) max_seq_len = 16 - save_interval_batches = 2 - huggingface_save_interval_batches = 3 device_batch_size = 1 dataset_size = 14 - max_duration_batches = 7 precision_str = 'bfloat16' precision = torch.bfloat16 + batches_per_epoch = math.ceil(dataset_size / (device_batch_size * 2)) checkpointer_callback = HuggingFaceCheckpointer( save_folder=os.path.join(tmp_path, 'checkpoints'), - save_interval=f'{huggingface_save_interval_batches}ba', + save_interval=hf_save_interval, precision=precision_str, mlflow_registered_model_name='dummy-registered-name' if log_to_mlflow else None, @@ -405,8 +410,8 @@ def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path, fsdp_config=fsdp_config if fsdp_state_dict_type is not None else None, train_dataloader=train_dataloader, save_folder=os.path.join(tmp_path, 
'checkpoints'), - save_interval=f'{save_interval_batches}ba', - max_duration=f'{max_duration_batches}ba', + save_interval=save_interval, + max_duration=max_duration, callbacks=[checkpointer_callback], loggers=[mlflow_logger_mock] if log_to_mlflow else [], optimizers=optimizer, @@ -442,15 +447,13 @@ def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path, name for name in os.listdir( os.path.join(tmp_path, 'checkpoints', 'huggingface')) ] - assert len(normal_checkpoints) == math.ceil(max_duration_batches / - save_interval_batches) - assert len(huggingface_checkpoints) == math.ceil( - max_duration_batches / huggingface_save_interval_batches) + assert len(normal_checkpoints) == expected_normal_checkpoints + assert len(huggingface_checkpoints) == expected_hf_checkpoints # Load the last huggingface checkpoint loaded_model = transformers.AutoModelForCausalLM.from_pretrained( os.path.join(tmp_path, 'checkpoints', 'huggingface', - f'ba{max_duration_batches}'), + f'ba{batches_per_epoch}'), trust_remote_code=True, ) @@ -471,7 +474,7 @@ def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path, loaded_tokenizer = transformers.AutoTokenizer.from_pretrained( os.path.join(tmp_path, 'checkpoints', 'huggingface', - f'ba{max_duration_batches}'), + f'ba{batches_per_epoch}'), trust_remote_code=True, ) From ea3279ac06c1adbd1763e672221afe344be908c0 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Wed, 25 Oct 2023 15:33:52 -0700 Subject: [PATCH 04/49] Add fixtures (#673) Add fixtures for testing boilerplate, tiny mpt models, and tiny finetune dataset --- tests/conftest.py | 18 ++-- tests/fixtures/__init__.py | 2 + tests/fixtures/autouse.py | 39 ++++++++ tests/fixtures/data.py | 58 +++++++++++ tests/fixtures/models.py | 70 +++++++++++++ tests/test_data_prep_scripts.py | 18 ++-- tests/test_flash_triton_torch.py | 9 -- tests/test_hf_config.py | 3 - tests/test_hf_mpt_gen.py | 162 +++++-------------------------- tests/test_hf_v_mpt.py | 9 -- tests/test_init_fn.py | 7 -- tests/test_model.py | 18 +--- tests/test_mpt_gen.py | 100 ++++++++++++------- tests/test_onnx.py | 2 - 14 files changed, 272 insertions(+), 243 deletions(-) create mode 100644 tests/fixtures/__init__.py create mode 100644 tests/fixtures/autouse.py create mode 100644 tests/fixtures/data.py create mode 100644 tests/fixtures/models.py diff --git a/tests/conftest.py b/tests/conftest.py index b39ebd66a9..545dc7e38f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,12 +1,10 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 -import gc import os from typing import List, Optional import pytest -import torch from composer.utils import reproducibility # Allowed options for pytest.mark.world_size() @@ -18,6 +16,13 @@ # Enforce deterministic mode before any tests start. 
reproducibility.configure_deterministic_mode() +# Add the path of any pytest fixture files you want to make global +pytest_plugins = [ + 'tests.fixtures.autouse', + 'tests.fixtures.models', + 'tests.fixtures.data', +] + def _add_option(parser: pytest.Parser, name: str, @@ -78,12 +83,3 @@ def pytest_collection_modifyitems(config: pytest.Config, def pytest_sessionfinish(session: pytest.Session, exitstatus: int): if exitstatus == 5: session.exitstatus = 0 # Ignore no-test-ran errors - - -@pytest.fixture(autouse=True) -def clear_cuda_cache(request: pytest.FixtureRequest): - """Clear memory between GPU tests.""" - marker = request.node.get_closest_marker('gpu') - if marker is not None and torch.cuda.is_available(): - torch.cuda.empty_cache() - gc.collect() # Only gc on GPU tests as it 2x slows down CPU tests diff --git a/tests/fixtures/__init__.py b/tests/fixtures/__init__.py new file mode 100644 index 0000000000..f6c1f9f3ab --- /dev/null +++ b/tests/fixtures/__init__.py @@ -0,0 +1,2 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 diff --git a/tests/fixtures/autouse.py b/tests/fixtures/autouse.py new file mode 100644 index 0000000000..c51ccfacb0 --- /dev/null +++ b/tests/fixtures/autouse.py @@ -0,0 +1,39 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +import gc + +import pytest +import torch +from composer.utils import dist, get_device, reproducibility + + +@pytest.fixture(autouse=True) +def initialize_dist(request: pytest.FixtureRequest): + """Initialize the default PyTorch distributed process group for tests.""" + # should we just always initialize dist like in train.py? + _default = pytest.mark.world_size(1).mark + world_size = request.node.get_closest_marker('world_size', _default).args[0] + gpu = request.node.get_closest_marker('gpu') + if world_size > 1: + dist.initialize_dist(get_device('gpu' if gpu is not None else 'cpu')) + + +@pytest.fixture(autouse=True) +def clear_cuda_cache(request: pytest.FixtureRequest): + """Clear memory between GPU tests.""" + marker = request.node.get_closest_marker('gpu') + if marker is not None and torch.cuda.is_available(): + torch.cuda.empty_cache() + gc.collect() # Only gc on GPU tests as it 2x slows down CPU tests + + +@pytest.fixture +def random_seed() -> int: + return 17 + + +@pytest.fixture(autouse=True) +def seed_all(random_seed: int): + """Sets the seed for reproducibility.""" + reproducibility.seed_all(random_seed) diff --git a/tests/fixtures/data.py b/tests/fixtures/data.py new file mode 100644 index 0000000000..39032146b6 --- /dev/null +++ b/tests/fixtures/data.py @@ -0,0 +1,58 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +from pathlib import Path + +from composer.utils import dist +from omegaconf import DictConfig +from pytest import fixture +from torch.utils.data import DataLoader +from transformers import PreTrainedTokenizerBase + +from llmfoundry.data.finetuning.dataloader import build_finetuning_dataloader +from tests.data_utils import make_tiny_ft_dataset + + +@fixture +def tiny_ft_dataset_path(tmp_path: Path, dataset_size: int = 4) -> Path: + """Creates a tiny dataset and returns the path.""" + tiny_dataset_path = tmp_path / 'test-ift-data-small' + tiny_dataset_path.mkdir(exist_ok=True) + tiny_dataset_file = tiny_dataset_path / 'train.jsonl' + if dist.get_world_size() == 1 or dist.get_global_rank() == 0: + make_tiny_ft_dataset(path=str(tiny_dataset_file), size=dataset_size) + return tiny_dataset_path + + 
+@fixture +def tiny_ft_dataloader(tiny_ft_dataset_path: Path, + mpt_tokenizer: PreTrainedTokenizerBase, + max_seq_len: int = 128, + device_batch_size: int = 1) -> DataLoader: + dataloader_cfg = DictConfig({ + 'name': 'finetuning', + 'dataset': { + 'hf_name': str(tiny_ft_dataset_path), + 'split': 'train', + 'max_seq_len': max_seq_len, + 'decoder_only_format': True, + 'allow_pad_trimming': False, + 'packing_ratio': None, + 'shuffle': True, + }, + 'drop_last': False, + 'num_workers': 4, + 'pin_memory': False, + 'prefetch_factor': 2, + 'persistent_workers': False, + 'timeout': 0 + }) + + dataloader = build_finetuning_dataloader( + dataloader_cfg, + mpt_tokenizer, + device_batch_size, + ).dataloader + + assert isinstance(dataloader, DataLoader) + return dataloader diff --git a/tests/fixtures/models.py b/tests/fixtures/models.py new file mode 100644 index 0000000000..1b1ef86302 --- /dev/null +++ b/tests/fixtures/models.py @@ -0,0 +1,70 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +from typing import Any, Callable + +from omegaconf import DictConfig +from pytest import fixture +from transformers import PreTrainedTokenizerBase + +from llmfoundry.models.hf.hf_causal_lm import ComposerHFCausalLM +from llmfoundry.models.model_registry import COMPOSER_MODEL_REGISTRY +from llmfoundry.models.mpt.modeling_mpt import ComposerMPTCausalLM +from llmfoundry.utils.builders import build_tokenizer + + +def _build_model(config: DictConfig, tokenizer: PreTrainedTokenizerBase): + model = COMPOSER_MODEL_REGISTRY[config.name](config, tokenizer) + return model + + +@fixture +def mpt_tokenizer(): + return build_tokenizer('EleutherAI/gpt-neox-20b', {}) + + +@fixture +def build_tiny_mpt( + mpt_tokenizer: PreTrainedTokenizerBase +) -> Callable[..., ComposerMPTCausalLM]: + + def build(**kwargs: Any) -> ComposerMPTCausalLM: + config = DictConfig({ + 'name': 'mpt_causal_lm', + 'd_model': 128, + 'n_heads': 4, + 'n_layers': 2, + 'expansion_ratio': 2, + }) + config.update(kwargs) + model = _build_model(config, mpt_tokenizer) + assert isinstance(model, ComposerMPTCausalLM) + return model + + return build + + +@fixture +def build_tiny_hf_mpt( + mpt_tokenizer: PreTrainedTokenizerBase +) -> Callable[..., ComposerHFCausalLM]: + + def build(**kwargs: Any) -> ComposerHFCausalLM: + config_overrides = { + 'd_model': 128, + 'n_heads': 4, + 'n_layers': 2, + 'expansion_ratio': 2, + } + config_overrides.update(kwargs) + config = DictConfig({ + 'name': 'hf_causal_lm', + 'pretrained_model_name_or_path': 'mosaicml/mpt-7b', + 'pretrained': False, + 'config_overrides': config_overrides, + }) + model = _build_model(config, mpt_tokenizer) + assert isinstance(model, ComposerHFCausalLM) + return model + + return build diff --git a/tests/test_data_prep_scripts.py b/tests/test_data_prep_scripts.py index 4c555ea9a2..4fe5ed7e64 100644 --- a/tests/test_data_prep_scripts.py +++ b/tests/test_data_prep_scripts.py @@ -2,9 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 import os -import shutil import sys from argparse import Namespace +from pathlib import Path # Add repo root to path so we can import scripts and test it repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) @@ -13,17 +13,16 @@ from scripts.data_prep.convert_dataset_json import main as main_json -def test_download_script_from_api(): +def test_download_script_from_api(tmp_path: Path): # test calling it directly - path = os.path.join(os.getcwd(), 'my-copy-c4-1') - shutil.rmtree(path, ignore_errors=True) + path = 
os.path.join(tmp_path, 'my-copy-c4-1') main_hf( Namespace( **{ 'dataset': 'c4', 'data_subset': 'en', 'splits': ['val_xsmall'], - 'out_root': './my-copy-c4-1', + 'out_root': path, 'compression': None, 'concat_tokens': None, 'bos_text': None, @@ -32,18 +31,16 @@ def test_download_script_from_api(): 'num_workers': None })) assert os.path.exists(path) - shutil.rmtree(path, ignore_errors=False) -def test_json_script_from_api(): +def test_json_script_from_api(tmp_path: Path): # test calling it directly - path = os.path.join(os.getcwd(), 'my-copy-arxiv-1') - shutil.rmtree(path, ignore_errors=True) + path = os.path.join(tmp_path, 'my-copy-arxiv-1') main_json( Namespace( **{ 'path': 'scripts/data_prep/example_data/arxiv.jsonl', - 'out_root': './my-copy-arxiv-1', + 'out_root': path, 'compression': None, 'split': 'train', 'concat_tokens': None, @@ -53,4 +50,3 @@ def test_json_script_from_api(): 'num_workers': None })) assert os.path.exists(path) - shutil.rmtree(path, ignore_errors=False) diff --git a/tests/test_flash_triton_torch.py b/tests/test_flash_triton_torch.py index 145d4a5885..e6fe8eb438 100644 --- a/tests/test_flash_triton_torch.py +++ b/tests/test_flash_triton_torch.py @@ -3,7 +3,6 @@ import pytest import torch -from composer.utils import reproducibility from omegaconf import OmegaConf as om @@ -39,8 +38,6 @@ def test_attn_impl(attn_impl_0: str, if alibi and (attn_impl_0 == 'flash' or attn_impl_1 == 'flash'): pytest.xfail('flash attn does not support alibi') - reproducibility.seed_all(7) - cfg = om.create({ 'attn_impl': 'flash', 'd_model': 128, @@ -135,8 +132,6 @@ def test_vs_mha(attn_impl: str, device: str = 'cuda'): """Compare diff attn_impl to torch.nn.MultiheadAttention.""" from llmfoundry.models.layers import attention - reproducibility.seed_all(17) - cfg = om.create({ 'attn_impl': attn_impl, 'd_model': 256, @@ -234,8 +229,6 @@ def test_grouped_attention_heads(attn_impl: str, """Ensure grouped_query_attention runs w/ diff n_heads & kv_n_heads.""" from llmfoundry.models.layers import attention - reproducibility.seed_all(17) - cfg = om.create({ 'attn_impl': attn_impl, 'd_model': 256, @@ -273,8 +266,6 @@ def test_grouped_query_invalid_heads(attn_impl: str, device: str = 'cuda'): """Check indivisble combinations of grouped_query_attention.""" from llmfoundry.models.layers import attention - reproducibility.seed_all(17) - cfg = om.create({ 'attn_impl': attn_impl, 'd_model': 256, diff --git a/tests/test_hf_config.py b/tests/test_hf_config.py index 5b3bb3d150..b47f267c55 100644 --- a/tests/test_hf_config.py +++ b/tests/test_hf_config.py @@ -9,7 +9,6 @@ import pytest import torch -from composer.utils import reproducibility from omegaconf import DictConfig from omegaconf import OmegaConf as om from transformers import AutoModelForCausalLM @@ -93,8 +92,6 @@ def test_hf_config_override( with open(conf_path) as f: test_cfg = om.load(f) - reproducibility.seed_all(test_cfg.seed) - # Build Model # For fast initialization, use `meta` device print('Initializing model...') diff --git a/tests/test_hf_mpt_gen.py b/tests/test_hf_mpt_gen.py index cc357141ba..ea133c64fa 100644 --- a/tests/test_hf_mpt_gen.py +++ b/tests/test_hf_mpt_gen.py @@ -1,167 +1,51 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 -from pathlib import Path -from typing import Any, Dict -from unittest.mock import Mock +from typing import Callable import pytest -from composer.callbacks import Generate as ComposerGenerate from composer.core.precision import get_precision_context -from composer.trainer 
import Trainer -from composer.utils import get_device, reproducibility -from omegaconf import DictConfig -from omegaconf import OmegaConf as om +from composer.utils import get_device +from transformers import PreTrainedTokenizerBase -from llmfoundry import COMPOSER_MODEL_REGISTRY -from llmfoundry.data.finetuning import build_finetuning_dataloader -from llmfoundry.utils import build_tokenizer -from tests.data_utils import make_tiny_ft_dataset +from llmfoundry.models.hf.hf_causal_lm import ComposerHFCausalLM @pytest.mark.gpu @pytest.mark.parametrize('device', ['cpu', 'gpu']) @pytest.mark.parametrize('attn_impl', ['triton', 'torch']) -def test_init_hfhub_mpt(device: str, attn_impl: str): +def test_init_hfhub_mpt( + device: str, + attn_impl: str, + build_tiny_hf_mpt: Callable[..., ComposerHFCausalLM], + mpt_tokenizer: PreTrainedTokenizerBase, +): if device == 'cpu' and attn_impl == 'triton': pytest.skip(f'{attn_impl=} not implemented for {device=}.') composer_device = get_device(device) - with open('scripts/train/yamls/pretrain/testing.yaml') as f: - test_cfg = om.load(f) - - assert isinstance(test_cfg, DictConfig) - reproducibility.seed_all(test_cfg.get('seed', 42)) - - attn_uses_sequence_id = True if test_cfg.get('eos_token_id', - None) is not None else False - test_cfg.model = DictConfig({ - 'name': 'hf_causal_lm', - 'pretrained_model_name_or_path': 'mosaicml/mpt-7b', - 'pretrained': False, - 'config_overrides': { - 'd_model': 128, - 'n_heads': 4, - 'n_layers': 2, - 'expansion_ratio': 2, - 'attn_config': { - 'attn_impl': attn_impl, - 'attn_uses_sequence_id': attn_uses_sequence_id, - }, - }, + model = build_tiny_hf_mpt(attn_config={ + 'attn_impl': attn_impl, + 'attn_uses_sequence_id': False, }) - - # build tokenizer - tokenizer_cfg: Dict[str, - Any] = om.to_container(test_cfg.tokenizer, - resolve=True) # type: ignore - tokenizer_name = tokenizer_cfg['name'] - tokenizer_kwargs = tokenizer_cfg.get('kwargs', {}) - tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) - - # build model - model = COMPOSER_MODEL_REGISTRY[test_cfg.model.name](test_cfg.model, - tokenizer) - test_cfg.n_params = sum(p.numel() for p in model.parameters()) + model = composer_device.module_to_device(model) model.eval() - model = composer_device.module_to_device(model) with get_precision_context('amp_bf16' if composer_device.name == 'gpu' else 'fp32'): _ = model.generate( composer_device.tensor_to_device( - tokenizer('hello', return_tensors='pt')['input_ids']), + mpt_tokenizer('hello', return_tensors='pt')['input_ids']), max_new_tokens=10, ) -def test_init_hfhub_mpt_cpu(): - test_init_hfhub_mpt(device='cpu', attn_impl='torch') - - -@pytest.mark.gpu -def test_mpt_generate_callback(tmpdir: Path): - composer_device = get_device('gpu') - reproducibility.seed_all(42) - max_seq_len = 128 - - # testing dataset and dataloader - dataset_size = 5 - - tiny_dataset_path = tmpdir / 'test-ift-data-small' - tiny_dataset_path.mkdir() - tiny_dataset_file = tiny_dataset_path / 'train.jsonl' - make_tiny_ft_dataset(path=str(tiny_dataset_file), size=dataset_size) - - dataloader_cfg = DictConfig({ - 'name': 'finetuning', - 'dataset': { - 'hf_name': str(tiny_dataset_path), - 'split': 'train', - 'max_seq_len': max_seq_len, - 'decoder_only_format': True, - 'allow_pad_trimming': False, - 'packing_ratio': None, - 'shuffle': True, - }, - 'drop_last': False, - 'num_workers': 4, - 'pin_memory': False, - 'prefetch_factor': 2, - 'persistent_workers': False, - 'timeout': 0 - }) - - # build tokenizer - tokenizer = 
build_tokenizer('EleutherAI/gpt-neox-20b', {}) - - # build mpt model - model_config = DictConfig({ - 'name': 'mpt_causal_lm', - 'config_overrides': { - 'd_model': 128, - 'n_heads': 4, - 'n_layers': 2, - 'expansion_ratio': 2, - }, - }) - model = COMPOSER_MODEL_REGISTRY[model_config.name](model_config, tokenizer) - model = composer_device.module_to_device(model) - - # generate callback - prompts = [ - 'The best banana bread recipe is', - '2+2=', - 'how much wood could a woodchuck chuck', - ] - gen_interval = 1 - generate = ComposerGenerate( - prompts, - interval=f'{gen_interval}ba', - max_new_tokens=5, - batch_size=len(prompts), - use_cache=True, - ) - generate.generate = Mock(wraps=generate.generate, autospec=True) - - # build trainer - device_batch_size = 1 - train_dataloader = build_finetuning_dataloader( - dataloader_cfg, - tokenizer, - device_batch_size, - ) - - trainer = Trainer( - model=model, - train_dataloader=train_dataloader, - device=composer_device, - max_duration=f'{gen_interval}ba', - callbacks=[generate], - ) - trainer.logger.log_table = Mock() - trainer.fit() - - generate.generate.assert_called_once() - trainer.logger.log_table.assert_called_once() +def test_init_hfhub_mpt_cpu( + build_tiny_hf_mpt: Callable[..., ComposerHFCausalLM], + mpt_tokenizer: PreTrainedTokenizerBase, +): + test_init_hfhub_mpt(device='cpu', + attn_impl='torch', + build_tiny_hf_mpt=build_tiny_hf_mpt, + mpt_tokenizer=mpt_tokenizer) diff --git a/tests/test_hf_v_mpt.py b/tests/test_hf_v_mpt.py index 82e2d05550..46172faf35 100644 --- a/tests/test_hf_v_mpt.py +++ b/tests/test_hf_v_mpt.py @@ -5,7 +5,6 @@ import pytest import torch -from composer.utils import reproducibility from omegaconf import OmegaConf as om from llmfoundry import COMPOSER_MODEL_REGISTRY @@ -52,10 +51,6 @@ def test_compare_hf_v_mpt(attn_impl: str, dropout: float, alibi: bool, batch_size = 2 # set batch size device = 'cuda' # set decive - # ensure reproducibility - seed = 17 - reproducibility.seed_all(seed) # set seed - # get hf gpt2 cfg hf_cfg = om.create({ 'model': { @@ -154,11 +149,9 @@ def test_compare_hf_v_mpt(attn_impl: str, dropout: float, alibi: bool, # UTIL: can be used to verify that models are not the same at init with torch.autocast(device_type='cuda', dtype=torch.float16): - torch.manual_seed(seed) hf_model_fwd = hf_model(batch)['logits'] if kpm is not None: hf_model_fwd *= kpm - torch.manual_seed(seed) model_fwd = model(batch).logits if kpm is not None: model_fwd *= kpm @@ -208,11 +201,9 @@ def test_compare_hf_v_mpt(attn_impl: str, dropout: float, alibi: bool, model.load_state_dict(_hf_model_statedict) with torch.autocast(device_type=device, dtype=torch.float16): - torch.manual_seed(seed) hf_model_fwd = hf_model(batch)['logits'] if kpm is not None: hf_model_fwd *= kpm - torch.manual_seed(seed) model_fwd = model(batch).logits if kpm is not None: model_fwd *= kpm diff --git a/tests/test_init_fn.py b/tests/test_init_fn.py index b054bac186..6be2c5ca42 100644 --- a/tests/test_init_fn.py +++ b/tests/test_init_fn.py @@ -8,7 +8,6 @@ import pytest import torch -from composer.utils import reproducibility from omegaconf import DictConfig, ListConfig from omegaconf import OmegaConf as om from torch import nn @@ -35,8 +34,6 @@ def forward(self, x: torch.Tensor): @pytest.mark.parametrize('is_residual', [True, False]) def test_div_is_residual(is_residual: bool): - reproducibility.seed_all(7) - in_features, out_features = 8, 32 cfg = om.create({ 'in_features': in_features, @@ -64,8 +61,6 @@ def test_div_is_residual(is_residual: bool): 
@pytest.mark.parametrize('fused', [True, False]) def test_fused_init_helper(fused: bool): - reproducibility.seed_all(7) - in_features, out_features = 8, 32 cfg = om.create({ 'in_features': in_features, @@ -133,8 +128,6 @@ def max_fill_init_(weight: torch.Tensor): ('emb_init_uniform_lim', [1, 1]) ]) def test_emb_init(emb_init_cfg: Optional[Tuple[str, Union[int, List[int]]]]): - reproducibility.seed_all(7) - cfg: Dict[str, Union[int, List[int]]] = { 'vocab_size': 64, 'in_features': 16, diff --git a/tests/test_model.py b/tests/test_model.py index 6ea530731a..67166bef68 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -16,7 +16,7 @@ from composer.core.precision import Precision, get_precision_context from composer.optim import DecoupledAdamW from composer.trainer.dist_strategy import prepare_fsdp_module -from composer.utils import dist, get_device, reproducibility +from composer.utils import dist, get_device from omegaconf import DictConfig, ListConfig from omegaconf import OmegaConf as om from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedModel, @@ -56,8 +56,6 @@ def get_objs(conf_path: str = 'scripts/train/yamls/pretrain/testing.yaml'): message='Torchmetrics v0.9 introduced a new argument class property') test_cfg = get_config(conf_path=conf_path) - reproducibility.seed_all(test_cfg.seed) - # Read FSDP Config as a dict fsdp_config = test_cfg.get('fsdp_config', None) fsdp_config = om.to_container(fsdp_config, @@ -316,7 +314,6 @@ def test_determinism(attn_impl: str, precision: torch.dtype): pytest.skip( 'This test requires CUDA to be available in order to run with bfloat16 precision.' ) - reproducibility.seed_all(1111) conf_path = 'scripts/train/yamls/pretrain/testing.yaml' with open(conf_path) as f: @@ -394,8 +391,6 @@ def test_loss_fn(): 'init_std': 0.02, } - reproducibility.seed_all(test_cfg.get('global_seed', 42)) - tokenizer_cfg: Dict[str, Any] = _load_tokenizer_cfg(test_cfg.tokenizer) tokenizer = build_tokenizer(test_cfg.tokenizer.name, tokenizer_cfg.get('kwargs', {})) @@ -537,7 +532,6 @@ def test_forward_with_padding(attention_impl: str, device: str, alibi: bool): if alibi and attention_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') - reproducibility.seed_all(1234) composer_device = get_device(device) hf_config = MPTConfig( @@ -716,7 +710,6 @@ def test_generate(attention_impl: str, device: str, alibi: bool): if alibi and attention_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') - reproducibility.seed_all(1234) composer_device = get_device(device) hf_config = MPTConfig( @@ -776,14 +769,12 @@ def test_generate(attention_impl: str, device: str, alibi: bool): use_cache=False) assert batched_generation.shape == (2, 6 + 5) - reproducibility.seed_all(1234) generation_with_left_padding = mpt.generate( input_ids=left_padding_input_ids, attention_mask=left_padding_attention_mask, max_new_tokens=5, use_cache=False) assert generation_with_left_padding.shape == (2, 6 + 5) - reproducibility.seed_all(1234) generation_with_no_padding = mpt.generate( input_ids=no_padding_input_ids, attention_mask=no_padding_attention_mask, @@ -1007,14 +998,12 @@ def test_forward_with_cache(attn_impl: str, device: str, alibi: bool): 'init_std': 0.02, }, ) - reproducibility.seed_all(1234) mpt = MPTForCausalLM(hf_config) mpt = composer_device.module_to_device(mpt) mpt.eval() with get_precision_context('amp_bf16' if composer_device.name == 'gpu' else 'fp32'): - reproducibility.seed_all(1234) first_input_ids 
= torch.tensor([[11274, 16390, 11]]) first_input_ids = composer_device.tensor_to_device(first_input_ids) first_attention_mask = torch.tensor([[1, 1, 1]]).bool() @@ -1040,7 +1029,6 @@ def test_forward_with_cache(attn_impl: str, device: str, alibi: bool): assert all(past_key_value[1].shape == (1, 3, 128) for past_key_value in first_output.past_key_values) - reproducibility.seed_all(1234) second_input_ids = torch.tensor([[11274, 16390, 11, 11274]]) second_input_ids = composer_device.tensor_to_device(second_input_ids) second_attention_mask = torch.tensor([[1, 1, 1, 1]]).bool() @@ -1070,7 +1058,6 @@ def test_forward_with_cache(attn_impl: str, device: str, alibi: bool): assert all(past_key_value[1].shape == (1, 4, 128) for past_key_value in second_output.past_key_values) - reproducibility.seed_all(1234) # pass through the first four tokens without the key-value cache full_output = mpt(second_input_ids, attention_mask=second_attention_mask) @@ -1205,7 +1192,6 @@ def test_model_to(attention_impl: str, alibi: bool): 'init_std': 0.02, }, ) - reproducibility.seed_all(1234) mpt = MPTForCausalLM(hf_config) mpt = mpt.bfloat16() mpt = mpt.to('cuda') @@ -1318,14 +1304,12 @@ def test_forward_with_output_attentions_and_output_hidden_states( 'init_std': 0.02, }, ) - reproducibility.seed_all(1234) mpt = MPTForCausalLM(hf_config) mpt = composer_device.module_to_device(mpt) mpt.eval() with get_precision_context('amp_bf16' if composer_device.name == 'gpu' else 'fp32'): - reproducibility.seed_all(1234) input_ids = torch.tensor([[11274, 16390, 11]]) input_ids = composer_device.tensor_to_device(input_ids) attention_mask = torch.tensor([[1, 1, 1]]).bool() diff --git a/tests/test_mpt_gen.py b/tests/test_mpt_gen.py index 06ddccd479..c52b765480 100644 --- a/tests/test_mpt_gen.py +++ b/tests/test_mpt_gen.py @@ -1,19 +1,21 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional, Tuple -from unittest.mock import patch +from typing import Callable, List, Optional, Tuple +from unittest.mock import Mock, patch import pytest import torch +from composer import Trainer +from composer.callbacks import Generate as ComposerGenerate from composer.core.precision import get_precision_context -from composer.utils import dist, get_device, reproducibility -from omegaconf import DictConfig +from composer.utils import dist, get_device from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.utils.data import DataLoader +from transformers import PreTrainedTokenizerBase -from llmfoundry import COMPOSER_MODEL_REGISTRY -from llmfoundry.models.mpt.modeling_mpt import MPTForCausalLM -from llmfoundry.utils import build_tokenizer +from llmfoundry.models.mpt.modeling_mpt import (ComposerMPTCausalLM, + MPTForCausalLM) EOS_TOKEN_ID = 0 @@ -55,44 +57,72 @@ def forward( @pytest.mark.parametrize('use_alibi', [True, False]) @patch('llmfoundry.models.mpt.modeling_mpt.MPTForCausalLM', new=MockMPTForCausalLM) -def test_mpt_generate_multi_gpu(attn_impl: str, use_alibi: bool): +def test_mpt_generate_multi_gpu(attn_impl: str, use_alibi: bool, + build_tiny_mpt: Callable[..., + ComposerMPTCausalLM], + mpt_tokenizer: PreTrainedTokenizerBase): """Tests mpt generation with mutiple gpus. and generations of different lengths. 
""" - composer_device = get_device('gpu') - dist.initialize_dist(composer_device) - reproducibility.seed_all(42) - - model_config = DictConfig({ - 'name': 'mpt_causal_lm', - 'd_model': 128, - 'n_heads': 4, - 'n_layers': 2, - 'expansion_ratio': 2, - 'no_bias': False, - 'use_cache': True, - 'attn_config': { - 'attn_impl': attn_impl, - 'attn_uses_sequence_id': False, - 'alibi': use_alibi - }, - }) - - # build tokenizer - tokenizer = build_tokenizer('EleutherAI/gpt-neox-20b', {}) - - # build model - model = COMPOSER_MODEL_REGISTRY[model_config.name](model_config, tokenizer) - model = composer_device.module_to_device(model) + device = get_device('gpu') + + model = build_tiny_mpt(attn_config={ + 'attn_impl': attn_impl, + 'attn_uses_sequence_id': False, + 'alibi': use_alibi + },) + model = device.module_to_device(model) + model.eval() model.model = FSDP(model.model) with get_precision_context('amp_bf16'): - _ = model.generate(composer_device.tensor_to_device( - tokenizer('hello', return_tensors='pt')['input_ids']), + _ = model.generate(device.tensor_to_device( + mpt_tokenizer('hello', return_tensors='pt')['input_ids']), max_new_tokens=3, eos_token_id=EOS_TOKEN_ID, use_cache=True, synced_gpus=True) + + +@pytest.mark.gpu +def test_mpt_generate_callback(build_tiny_mpt: Callable[..., + ComposerMPTCausalLM], + tiny_ft_dataloader: DataLoader): + device = get_device('gpu') + + # build mpt model + model = build_tiny_mpt() + model = device.module_to_device(model) + + # generate callback + prompts = [ + 'The best banana bread recipe is', + '2+2=', + 'how much wood could a woodchuck chuck', + ] + gen_interval = 1 + generate = ComposerGenerate( + prompts, + interval=f'{gen_interval}ba', + max_new_tokens=5, + batch_size=len(prompts), + use_cache=True, + ) + generate.generate = Mock(wraps=generate.generate, autospec=True) + + # build trainer + trainer = Trainer( + model=model, + train_dataloader=tiny_ft_dataloader, + device=device, + max_duration=f'{gen_interval}ba', + callbacks=[generate], + ) + trainer.logger.log_table = Mock() + trainer.fit() + + generate.generate.assert_called_once() + trainer.logger.log_table.assert_called_once() diff --git a/tests/test_onnx.py b/tests/test_onnx.py index 4ccb8e4112..d0e01746eb 100644 --- a/tests/test_onnx.py +++ b/tests/test_onnx.py @@ -4,7 +4,6 @@ import pathlib import torch -from composer.utils import reproducibility from transformers import AutoModelForCausalLM from llmfoundry import MPTConfig, MPTForCausalLM @@ -27,7 +26,6 @@ def gen_random_batch(batch_size: int, vocab_size: int, max_seq_len: int): def test_onnx_export(tmp_path: pathlib.Path): - reproducibility.seed_all(42) from transformers.models.auto.configuration_auto import CONFIG_MAPPING CONFIG_MAPPING._extra_content['mpt'] = MPTConfig AutoModelForCausalLM.register(MPTConfig, MPTForCausalLM) From 08611b0d31eb3a63531aff9630bbecf55b4b08de Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Wed, 25 Oct 2023 19:21:15 -0700 Subject: [PATCH 05/49] Make default for cuda_load_lazy false (#694) --- scripts/train/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index 8c1c28eb5c..e29f2c9a47 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -219,7 +219,7 @@ def main(cfg: DictConfig) -> Trainer: # Set CUDA lazy loading # This can save a bit of memory if not all modules are needed - cuda_load_lazy: bool = cfg.pop('cuda_load_lazy', True) + cuda_load_lazy: bool = cfg.pop('cuda_load_lazy', False) if cuda_load_lazy: 
os.environ['CUDA_MODULE_LOADING'] = 'LAZY' From c60657b2093ae4d1ead775138035ed2448ed1c96 Mon Sep 17 00:00:00 2001 From: Charles Tang Date: Wed, 25 Oct 2023 20:57:49 -0700 Subject: [PATCH 06/49] Update README.md (#693) ``` ________________________________ Traceback (most recent call last) _________________________________ _ /mnt/workdisk/brian/llm-foundry-private/scripts/train/train.py:604 in _ _ _ _ 601 _ cfg = om.merge(yaml_cfg, cli_cfg) _ _ 602 _ om.resolve(cfg) _ _ 603 _ assert isinstance(cfg, DictConfig) _ _ _ 604 _ main(cfg) _ _ 605 _ _ _ _ /mnt/workdisk/brian/llm-foundry-private/scripts/train/train.py:222 in main _ _ _ _ 219 _ _ _ _ _ _ _ _ _ _ _ _ 'dist_timeout', _ _ 220 _ _ _ _ _ _ _ _ _ _ _ _ must_exist=False, _ _ 221 _ _ _ _ _ _ _ _ _ _ _ _ default_value=600.0) _ _ _ 222 _ dist.initialize_dist(get_device(None), timeout=dist_timeout) _ _ 223 _ _ _ 224 _ # Get global and device batch size information from distributed/single node setting _ _ 225 _ cfg = update_batch_size_info(cfg) _ _ _ _ /mnt/workdisk/brian/composer/composer/utils/dist.py:527 in initialize_dist _ _ _ _ 524 _ _ os.environ.update(dist_env_var_defaults) _ _ 525 _ _ dist.init_process_group(device_obj.dist_backend, store=dist.HashStore(), world_s _ _ 526 _ else: _ _ _ 527 _ _ dist.init_process_group(device_obj.dist_backend, timeout=timeout_timedelta) _ _ 528 _ _ 529 _ _ 530 def get_sampler(dataset: torch.utils.data.Dataset, *, drop_last: bool = False, shuffle: _ _ _ _ /mnt/workdisk/brian/mpt_checkpoint/lib/python3.10/site-packages/torch/distributed/c10d_logger.py _ _ :74 in wrapper _ _ _ _ 71 _ @functools.wraps(func) _ _ 72 _ def wrapper(*args, **kwargs): _ _ 73 _ _ t1 = time.time_ns() _ _ _ 74 _ _ func_return = func(*args, **kwargs) _ _ 75 _ _ t2 = time.time_ns() _ _ 76 _ _ _ _ 77 _ _ if dist.is_initialized(): _ _ _ _ /mnt/workdisk/brian/mpt_checkpoint/lib/python3.10/site-packages/torch/distributed/distributed_c1 _ _ 0d.py:1141 in init_process_group _ _ _ _ 1138 _ _ _ rendezvous_iterator = rendezvous( _ _ 1139 _ _ _ _ init_method, rank, world_size, timeout=timeout _ _ 1140 _ _ _ ) _ _ _ 1141 _ _ _ store, rank, world_size = next(rendezvous_iterator) _ _ 1142 _ _ _ store.set_timeout(timeout) _ _ 1143 _ _ _ _ _ 1144 _ _ _ # Use a PrefixStore to avoid accidental overrides of keys used by _ _ _ _ /mnt/workdisk/brian/mpt_checkpoint/lib/python3.10/site-packages/torch/distributed/rendezvous.py: _ _ 231 in _env_rendezvous_handler _ _ _ _ 228 _ if "rank" in query_dict: _ _ 229 _ _ rank = int(query_dict["rank"]) _ _ 230 _ else: _ _ _ 231 _ _ rank = int(_get_env_or_raise("RANK")) _ _ 232 _ _ _ 233 _ if "world_size" in query_dict: _ _ 234 _ _ world_size = int(query_dict["world_size"]) _ _ _ _ /mnt/workdisk/brian/mpt_checkpoint/lib/python3.10/site-packages/torch/distributed/rendezvous.py: _ _ 216 in _get_env_or_raise _ _ _ _ 213 _ def _get_env_or_raise(env_var: str) -> str: _ _ 214 _ _ env_val = os.environ.get(env_var, None) _ _ 215 _ _ if not env_val: _ _ _ 216 _ _ _ raise _env_error(env_var) _ _ 217 _ _ else: _ _ 218 _ _ _ return env_val _ _ 219 ``` --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 04bad9c519..1d3f6d5df4 100644 --- a/README.md +++ b/README.md @@ -228,7 +228,7 @@ python inference/convert_composer_to_hf.py \ # --hf_repo_for_upload user-org/repo-name # Evaluate the model on a subset of tasks -python eval/eval.py \ +composer eval/eval.py \ eval/yamls/hf_eval.yaml \ icl_tasks=eval/yamls/copa.yaml \ model_name_or_path=mpt-125m-hf From 
6f5973849a4c5a6d7e5ca30040e60435aa62061d Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Thu, 26 Oct 2023 15:25:10 -0700 Subject: [PATCH 07/49] Pad tiktoken vocab so that additional_special_tokens works (#695) --- llmfoundry/tokenizers/tiktoken.py | 33 ++++++++++++++++++++++++-- tests/test_tiktoken.py | 39 ++++++++++++++++++++++++++----- 2 files changed, 64 insertions(+), 8 deletions(-) diff --git a/llmfoundry/tokenizers/tiktoken.py b/llmfoundry/tokenizers/tiktoken.py index 45192e09dd..10a296497a 100644 --- a/llmfoundry/tokenizers/tiktoken.py +++ b/llmfoundry/tokenizers/tiktoken.py @@ -1,6 +1,7 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 +import warnings from typing import Any, Dict, List, Optional, Tuple, Union import torch @@ -26,7 +27,7 @@ def __init__(self, eos_token: Optional[str] = '<|endoftext|>', bos_token: Optional[str] = '<|endoftext|>', pad_token: Optional[str] = None, - **kwargs: Dict[str, Any]): + **kwargs: Any): """Constructor creates a tiktoken tokenizer to use as the underlying. tokenizer. @@ -90,7 +91,17 @@ def is_fast(self) -> bool: return False def get_vocab(self) -> Dict[str, int]: - """Returns vocab as a dict.""" + """Returns vocab as a dict. + + Note: This function does not work properly due to difference in assumptions between tiktoken and Hugging Face tokenizers. + Most uses do not need to use get_vocab, so this is not a priority to fix. + """ + warnings.warn( + 'get_vocab does not work properly with TiktokenTokenizerWrapper. Please do not rely on it being perfectly correct.' + + + ' It will be called once init just to get the size of the vocab inside the base class.' + ) + vocab = {} for i in range(self.vocab_size): try: @@ -101,6 +112,24 @@ def get_vocab(self) -> Dict[str, int]: except KeyError: pass + # As far as I can tell, we don't require get_vocab to completely work, + # but when using additional_special_tokens, Hugging Face determines the next + # token index to add with len(self.get_vocab()) so we need the _size_ of this dictionary to be correct. 
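Aside (not part of the patch): the comment above states the constraint, and the hunk continues below with the gap-filling loop. Note that the placeholder-token literals in this copy of the diff render as empty f-strings (`f''`); the tag-like token name appears to have been stripped, so the exact spelling (something like `<extra_id_N>`) should be treated as an assumption. A small standalone sketch of the same idea:

```python
# Standalone sketch: pad a partial vocab mapping so that len(vocab) == vocab_size,
# giving unused indices placeholder names. The '<extra_id_N>' naming is an
# assumption, not taken from the diff.
from typing import Dict


def pad_vocab(vocab: Dict[str, int], vocab_size: int) -> Dict[str, int]:
    indices_to_fill = set(range(vocab_size)) - set(vocab.values())
    extra_id = 0
    for index in indices_to_fill:
        candidate = f'<extra_id_{extra_id}>'
        # Never overwrite a token that already exists in the vocab
        while candidate in vocab:
            extra_id += 1
            candidate = f'<extra_id_{extra_id}>'
        vocab[candidate] = index
    return vocab


padded = pad_vocab({'a': 0, 'b': 2}, vocab_size=4)
assert len(padded) == 4  # indices 1 and 3 now have placeholder entries
```

With the size correct, Hugging Face's `len(self.get_vocab())` bookkeeping assigns any `additional_special_tokens` the next free ids, which is what the new test at the bottom of this patch checks.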
+ extra_id_index = 0 + candidate_extra_id = f'' + indices_to_fill_in = {i for i in range(self.vocab_size)} - set( + vocab.values()) + + # Add enough indices to make get_vocab() the right length + for index_to_add in indices_to_fill_in: + # Make sure we don't overwrite a token that already exists + while candidate_extra_id in vocab: + extra_id_index += 1 + candidate_extra_id = f'' + + # Get an index to add and add the item + vocab[candidate_extra_id] = index_to_add + return vocab def _tokenize(self, text: str) -> List[int]: diff --git a/tests/test_tiktoken.py b/tests/test_tiktoken.py index 85ff18100b..d1568e6d2a 100644 --- a/tests/test_tiktoken.py +++ b/tests/test_tiktoken.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import pathlib -from typing import TYPE_CHECKING, Optional, Tuple +from typing import TYPE_CHECKING, List, Optional, Tuple import pytest import transformers @@ -49,15 +49,18 @@ def get_tokenizers_for_testing( encoding_name: Optional[str], tmp_path: pathlib.Path, add_bos_token: bool = False, - add_eos_token: bool = False + add_eos_token: bool = False, + additional_special_tokens: Optional[List[str]] = None, ) -> Tuple[TiktokenTokenizerWrapper, TiktokenTokenizerWrapper, 'Encoding']: tiktoken = pytest.importorskip('tiktoken') # Construction - wrapped_tokenizer = TiktokenTokenizerWrapper(model_name=model_name, - encoding_name=encoding_name, - add_bos_token=add_bos_token, - add_eos_token=add_eos_token) + wrapped_tokenizer = TiktokenTokenizerWrapper( + model_name=model_name, + encoding_name=encoding_name, + add_bos_token=add_bos_token, + add_eos_token=add_eos_token, + additional_special_tokens=additional_special_tokens) if model_name is not None: original_tokenizer = tiktoken.encoding_for_model(model_name) else: @@ -176,6 +179,10 @@ def test_tiktoken_vocab(model_name: Optional[str], encoding_name: Optional[str], didnt_match = [] for key, value in wrapped_vocab.items(): + # Skip checking the extra ids we pad the vocab with + if key.startswith(''): + continue + if original_tokenizer.encode(key, allowed_special='all') == [value]: continue else: @@ -232,3 +239,23 @@ def test_tiktoken_encode_plus(model_name: Optional[str], encoded_special_mask = encoded_outputs.special_tokens_mask assert encoded_special_mask[0] == 1 assert encoded_special_mask[-1] == 1 + + +@pytest.mark.parametrize('model_name,encoding_name', + MODEL_ENCODING_NAME_PARAMETRIZATION) +def test_additional_special_tokens(model_name: Optional[str], + encoding_name: Optional[str], + tmp_path: pathlib.Path): + special_token_to_add = '<|im_start|>' + wrapped_tokenizer, _, _ = get_tokenizers_for_testing( + model_name, + encoding_name, + tmp_path, + add_bos_token=False, + add_eos_token=False, + additional_special_tokens=[special_token_to_add]) + encoded_outputs = wrapped_tokenizer(special_token_to_add + + ' hello')['input_ids'] + + assert encoded_outputs[0] == wrapped_tokenizer.vocab_size + assert len(encoded_outputs) == 2 From 7009d4d1a5dd9191874579d422d312a52b5fd060 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Thu, 26 Oct 2023 16:45:05 -0700 Subject: [PATCH 08/49] remove logs (#698) --- pyproject.toml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a2fcec3eed..0b078120b3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -86,13 +86,6 @@ filterwarnings = [ 'ignore::DeprecationWarning:tensorboard', # ignore tensorboard ] -# Enable logging for pytest -log_cli = true -log_cli_level = "INFO" -log_cli_format = "%(asctime)s [%(levelname)8s] %(message)s 
(%(filename)s:%(lineno)s)" -log_cli_date_format = "%Y-%m-%d %H:%M:%S" - - # Yapf [tool.yapf] # Align closing bracket with visual indentation. From 9027f49153d89e6b0b225af3626311a9b4658dbf Mon Sep 17 00:00:00 2001 From: Jeremy D <115047575+bmosaicml@users.noreply.github.com> Date: Fri, 27 Oct 2023 13:45:13 -0400 Subject: [PATCH 09/49] Change gauntlet avging (#640) * commit * commit * commit * commit * commit * restore mcli * eval gauntlet cb * fix error * address daniels comments * parametrize * parametrize * precommit * change * change --------- Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- .../callbacks/eval_gauntlet_callback.py | 88 +++++++++++++------ scripts/eval/eval.py | 19 ++-- scripts/eval/yamls/eval_gauntlet.yaml | 21 +++++ scripts/eval/yamls/hf_eval.yaml | 2 +- tests/test_eval.py | 4 +- tests/test_eval_gauntlet.py | 17 +++- tests/test_training.py | 29 ++++-- 7 files changed, 129 insertions(+), 51 deletions(-) diff --git a/llmfoundry/callbacks/eval_gauntlet_callback.py b/llmfoundry/callbacks/eval_gauntlet_callback.py index 78ccbb529b..7281a8d1fc 100644 --- a/llmfoundry/callbacks/eval_gauntlet_callback.py +++ b/llmfoundry/callbacks/eval_gauntlet_callback.py @@ -22,6 +22,32 @@ class Weighting(Enum): LOG_SAMPLE_SZ = 3 +def calculate_named_averages(average_names: Dict[str, list], + category_scores: Dict[str, float]): + """Calculates the named averages based off the raw category scores. + + For each named average, take a simple average of all the category scores associated with that named average. + + Args: + average_names (dict[str, list]): Contains a mapping of named averages to which category scores that average should consist of. + category_scores (dict[str, float]): Contains the raw scores corresponding to each category. + """ + average_scores = {} + for avg_name, category_list in average_names.items(): + composite_subset = { + category: score + for category, score in category_scores.items() + if category in category_list + } + if len(composite_subset.values()) > 0: + average_scores[avg_name] = sum(composite_subset.values()) / len( + composite_subset.values()) + else: + average_scores[avg_name] = 0 + + return average_scores + + class EvalGauntlet(Callback): """The EvalGauntlet aggregates ICL eval results. @@ -31,7 +57,7 @@ class EvalGauntlet(Callback): Args: logger_keys (list): These are the exact keys that the individual benchmark metrics will be logged under in the logger after eval - tasks (dict): This contains the list of categories, as well as the subtasks within them, the + categories (dict): This contains the list of categories, as well as the subtasks within them, the random baseline accuracy of each subtask, and the number of fewshot examples used for the task. See `llmfoundry/scripts/eval/yamls/eval_gauntlet.yaml` to see the structure. weighting (Weighting): The weighting scheme used to balance different tasks within each category. @@ -43,6 +69,7 @@ class EvalGauntlet(Callback): rescale_accuracy (bool): Flag determining whether to rescale the accuracy on each benchmark by (1-random_baseline_accuracy) before aggregating. Using this ensures that all benchmarks max out at 1.0. benchmark_sizes (Optional[dict]): Optional data on benchmark sizes, used when not relying on equal weighting. + averages (Optional[dict]): Optional dictionary specifying a mapping from a average names to lists of categories used produce each named average. 
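Aside (not part of the patch): a quick illustration of the averaging behavior documented above; the scores are made up, and the import path assumes the module location shown in this diff.

```python
# Illustrative usage of the helper added in this patch; scores are made up.
from llmfoundry.callbacks.eval_gauntlet_callback import calculate_named_averages

category_scores = {
    'world_knowledge': 0.4,
    'language_understanding': 0.6,
    'programming': 0.1,
}
averages = {'core_average': ['world_knowledge', 'language_understanding']}

# Each named average is a simple mean over the categories listed for it,
# so this prints {'core_average': 0.5}; 'programming' is ignored.
print(calculate_named_averages(averages, category_scores))
```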
""" def __init__(self, @@ -51,7 +78,8 @@ def __init__(self, weighting: str = 'EQUAL', subtract_random_baseline: bool = True, rescale_accuracy: bool = True, - benchmark_sizes: Optional[dict] = None): + benchmark_sizes: Optional[dict] = None, + averages: Optional[dict] = None): if isinstance(logger_keys, dict): raise ValueError( 'logger_keys now requires a list type as input, not a dict') @@ -66,13 +94,12 @@ def __init__(self, ) self.categories = categories + self.category_names = [conf.get('name') for conf in self.categories] self.weighting = Weighting[weighting] self.subtract_random_baseline = subtract_random_baseline self.rescale_accuracy = rescale_accuracy self.logger_keys = logger_keys - for category in self.categories: - for benchmark in category['benchmarks']: bench_name = f"{benchmark['name']}/{benchmark['num_fewshot']}-shot" @@ -95,7 +122,20 @@ def __init__(self, assert weight is not None benchmark['weighting'] = weight - def compute_averages(self, state: State) -> Dict[str, float]: + self.averages = {} + if averages is not None: + self.averages = averages + else: + # if no averages spec provided, simply average everything + self.averages['default_average'] = self.category_names + + for avg_name in self.averages: + if avg_name in self.category_names: + raise ValueError( + f'Found average name `{avg_name}` used as category name. Average names and category names must be non-overlapping.' + ) + + def extract_metrics_from_state(self, state: State) -> Dict[str, float]: results = {} for key in self.logger_keys: @@ -121,23 +161,22 @@ def compute_averages(self, state: State) -> Dict[str, float]: return {k: sum(v) / len(v) for k, v in results.items()} def eval_after_all(self, state: State, logger: Logger) -> Dict[str, float]: - new_metrics = self.compute_averages(state) - if len(new_metrics) == 0: + computed_metrics = self.extract_metrics_from_state(state) + if len(computed_metrics) == 0: return {} - composite_scores = {} - + category_scores = {} for category in self.categories: missing_metrics = [] - composite_scores[category['name']] = [] + category_scores[category['name']] = [] for benchmark in category['benchmarks']: key = f"{benchmark['name']}/{benchmark['num_fewshot']}-shot" - if key not in new_metrics: + if key not in computed_metrics: log.warning( f'Could not find results for benchmark: {benchmark}.') missing_metrics.append(key) else: - score = new_metrics[key] + score = computed_metrics[key] if self.subtract_random_baseline: score -= benchmark['random_baseline'] @@ -145,7 +184,7 @@ def eval_after_all(self, state: State, logger: Logger) -> Dict[str, float]: if self.rescale_accuracy and self.subtract_random_baseline: score /= 1.0 - benchmark['random_baseline'] - composite_scores[category['name']].append({ + category_scores[category['name']].append({ 'name': benchmark['name'], 'score': score, 'weighting': benchmark['weighting'] @@ -155,23 +194,22 @@ def eval_after_all(self, state: State, logger: Logger) -> Dict[str, float]: log.warning( f"Removing category `{category['name']}` from scores because benchmarks were missing: {missing_metrics}" ) - del composite_scores[category['name']] + del category_scores[category['name']] continue total_weight = sum( - k['weighting'] for k in composite_scores[category['name']]) - composite_scores[category['name']] = sum( + k['weighting'] for k in category_scores[category['name']]) + category_scores[category['name']] = sum( k['score'] * (k['weighting'] / total_weight) - for k in composite_scores[category['name']]) + for k in 
category_scores[category['name']]) - composite_scores = { + named_averages = calculate_named_averages(self.averages, + category_scores) + category_scores.update(named_averages) + category_scores = { f'icl/metrics/eval_gauntlet/{k}': v - for k, v in composite_scores.items() + for k, v in category_scores.items() } - - composite_scores['icl/metrics/eval_gauntlet/average'] = sum( - composite_scores.values()) / len(composite_scores.values()) if len( - composite_scores.values()) > 0 else 0 if logger is not None: - logger.log_metrics(composite_scores) + logger.log_metrics(category_scores) - return composite_scores + return category_scores diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index f07942ba10..02a5d1f862 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -145,7 +145,8 @@ def evaluate_model( if eval_gauntlet_df is None and eval_gauntlet_callback is not None: eval_gauntlet_df = pd.DataFrame( - columns=['model_name', 'average'] + + columns=['model_name'] + + [avg for avg in eval_gauntlet_callback.averages] + [t.name for t in eval_gauntlet_callback.categories]) load_path = model_cfg.get('load_path', None) @@ -314,23 +315,17 @@ def main(cfg: DictConfig): if eval_gauntlet_df is not None and eval_gauntlet_callback is not None: assert composite_scores is not None row = {'model_name': model_cfg['model_name']} - row.update({ - t.name: - composite_scores.get(f'icl/metrics/eval_gauntlet/{t.name}', - None) - for t in eval_gauntlet_callback.categories - }) - row.update({ - 'average': - composite_scores[f'icl/metrics/eval_gauntlet/average'] - }) + row.update( + {k.split('/')[-1]: v for k, v in composite_scores.items()}) eval_gauntlet_df = pd.concat( [eval_gauntlet_df, pd.DataFrame([row])], ignore_index=True) print(f'Printing gauntlet results for all models') + print( eval_gauntlet_df.sort_values( - 'average', ascending=False).to_markdown(index=False)) + list(eval_gauntlet_callback.averages.keys())[0], + ascending=False).to_markdown(index=False)) print(f'Printing complete results for all models') assert models_df is not None print(models_df.to_markdown(index=False)) diff --git a/scripts/eval/yamls/eval_gauntlet.yaml b/scripts/eval/yamls/eval_gauntlet.yaml index 87e01fd44c..1d2fa34139 100644 --- a/scripts/eval/yamls/eval_gauntlet.yaml +++ b/scripts/eval/yamls/eval_gauntlet.yaml @@ -2,6 +2,27 @@ eval_gauntlet: weighting: EQUAL subtract_random_baseline: true rescale_accuracy: true + averages: + core_average: + - world_knowledge + - commonsense_reasoning + - language_understanding + - symbolic_problem_solving + - reading_comprehension + - programming + lm_task_average: + - world_knowledge_lm_task_subscore + - commonsense_reasoning_lm_task_subscore + - language_understanding_lm_task_subscore + - symbolic_problem_solving_lm_task_subscore + - reading_comprehension_lm_task_subscore + lite_average: + - world_knowledge_lite + - commonsense_reasoning_lite + - language_understanding_lite + - symbolic_problem_solving_lite + - reading_comprehension_lite + - programming_lite categories: - name: world_knowledge benchmarks: diff --git a/scripts/eval/yamls/hf_eval.yaml b/scripts/eval/yamls/hf_eval.yaml index 05169818d9..759af8239a 100644 --- a/scripts/eval/yamls/hf_eval.yaml +++ b/scripts/eval/yamls/hf_eval.yaml @@ -43,5 +43,5 @@ device_eval_batch_size: 4 # forward_prefetch: True # limit_all_gathers: True -icl_tasks: 'eval/yamls/tasks.yaml' +icl_tasks: 'eval/yamls/tasks_light.yaml' eval_gauntlet: 'eval/yamls/eval_gauntlet.yaml' diff --git a/tests/test_eval.py b/tests/test_eval.py index 
ecd15ab62f..1217487b70 100644 --- a/tests/test_eval.py +++ b/tests/test_eval.py @@ -62,7 +62,7 @@ def test_icl_eval(capfd: Any, mock_saved_model_path: Any): assert isinstance(test_cfg, om.DictConfig) main(test_cfg) out, _ = capfd.readouterr() - expected_results = '| Category | Benchmark | Subtask | Accuracy | Number few shot | Model |\n|:----------------------------|:---------------|:----------|-----------:|:------------------|:---------|\n| language_understanding_lite | lambada_openai | | 0 | 0-shot | tiny_mpt ' + expected_results = '| Category | Benchmark | Subtask | Accuracy | Number few shot | Model |\n|:----------------------------|:---------------|:----------|-----------:|:------------------|:---------|\n| language_understanding_lite | lambada_openai | | 0 | 0-shot | tiny_mpt |' assert expected_results in out - expected_results = '| model_name | average | language_understanding_lite |\n|:-------------|----------:|------------------------------:|\n| tiny_mpt | 0 | 0 |' + expected_results = '| model_name | default_average | language_understanding_lite |\n|:-------------|------------------:|------------------------------:|\n| tiny_mpt | 0 | 0 |' assert expected_results in out diff --git a/tests/test_eval_gauntlet.py b/tests/test_eval_gauntlet.py index 8ccdd75766..3a1e371ab8 100644 --- a/tests/test_eval_gauntlet.py +++ b/tests/test_eval_gauntlet.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import os -from typing import Dict, List +from typing import Dict, List, Optional import omegaconf as om import pytest @@ -53,7 +53,10 @@ def log_metrics(self, metrics: Dict[str, float]) -> None: self.inmemorylogger.log_metrics(metrics) -def test_gauntlet_callback(): +@pytest.mark.parametrize('averages', [{ + 'core_average': ['world_knowledge', 'language_understanding'] +}, None]) +def test_gauntlet_callback(averages: Optional[dict]): icl_task_config = om.OmegaConf.create(""" - label: jeopardy_small dataset_uri: eval/local_data/world_knowledge/jeopardy_small.jsonl # ADD YOUR OWN DATASET URI @@ -87,6 +90,9 @@ def test_gauntlet_callback(): """) assert isinstance(eval_gauntlet_config, om.DictConfig) or isinstance( eval_gauntlet_config, str) + + if averages is not None: + eval_gauntlet_config.averages = averages tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b') # test loading functionality @@ -106,4 +112,9 @@ def test_gauntlet_callback(): name = f'icl/metrics/eval_gauntlet/{category}' assert result[name] == pytest.approx(0.25) - assert result['icl/metrics/eval_gauntlet/average'] == pytest.approx(0.25) + if averages is None: + assert result[ + 'icl/metrics/eval_gauntlet/default_average'] == pytest.approx(0.25) + else: + assert result[ + 'icl/metrics/eval_gauntlet/core_average'] == pytest.approx(0.25) diff --git a/tests/test_training.py b/tests/test_training.py index 9d40fc2a78..214909cc28 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -6,7 +6,7 @@ import shutil import sys from argparse import Namespace -from typing import Any +from typing import Any, Optional import pytest from composer.loggers import InMemoryLogger @@ -114,7 +114,11 @@ def set_correct_cwd(): os.chdir('..') -def test_train_gauntlet(set_correct_cwd: Any, tmp_path: pathlib.Path): +@pytest.mark.parametrize('averages', [{ + 'core_average': ['language_understanding_lite'] +}, None]) +def test_train_gauntlet(averages: Optional[dict], set_correct_cwd: Any, + tmp_path: pathlib.Path): """Test training run with a small dataset.""" dataset_name = create_c4_dataset_xsmall(tmp_path) test_cfg = 
gpt_tiny_cfg(dataset_name, 'cpu') @@ -155,6 +159,9 @@ def test_train_gauntlet(set_correct_cwd: Any, tmp_path: pathlib.Path): ]) }) + if averages is not None: + test_cfg.eval_gauntlet['averages'] = averages + test_cfg.icl_seq_len = 128 test_cfg.max_duration = '1ba' test_cfg.eval_interval = '1ba' @@ -167,14 +174,20 @@ def test_train_gauntlet(set_correct_cwd: Any, tmp_path: pathlib.Path): inmemorylogger = trainer.logger.destinations[ 0] # pyright: ignore [reportGeneralTypeIssues] assert isinstance(inmemorylogger, InMemoryLogger) - assert 'icl/metrics/eval_gauntlet/average' in inmemorylogger.data.keys() - assert isinstance(inmemorylogger.data['icl/metrics/eval_gauntlet/average'], - list) - assert len(inmemorylogger.data['icl/metrics/eval_gauntlet/average'][-1]) > 0 + + category_name = 'default_average' if averages is None else 'core_average' + assert f'icl/metrics/eval_gauntlet/{category_name}' in inmemorylogger.data.keys( + ) assert isinstance( - inmemorylogger.data['icl/metrics/eval_gauntlet/average'][-1], tuple) + inmemorylogger.data[f'icl/metrics/eval_gauntlet/{category_name}'], list) + assert len(inmemorylogger.data[f'icl/metrics/eval_gauntlet/{category_name}'] + [-1]) > 0 + assert isinstance( + inmemorylogger.data[f'icl/metrics/eval_gauntlet/{category_name}'][-1], + tuple) - assert inmemorylogger.data['icl/metrics/eval_gauntlet/average'][-1][-1] == 0 + assert inmemorylogger.data[f'icl/metrics/eval_gauntlet/{category_name}'][ + -1][-1] == 0 def test_train_multi_eval(set_correct_cwd: Any, tmp_path: pathlib.Path): From db9227aa47adae117f1b26f8f8bca830935140d0 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Mon, 30 Oct 2023 14:06:01 -0700 Subject: [PATCH 10/49] Remove prefixlm support for OPT and Bloom (#704) --- .../models/utils/hf_prefixlm_converter.py | 603 +----------------- tests/test_model.py | 13 +- 2 files changed, 6 insertions(+), 610 deletions(-) diff --git a/llmfoundry/models/utils/hf_prefixlm_converter.py b/llmfoundry/models/utils/hf_prefixlm_converter.py index fb9477d909..692fab94c2 100644 --- a/llmfoundry/models/utils/hf_prefixlm_converter.py +++ b/llmfoundry/models/utils/hf_prefixlm_converter.py @@ -10,31 +10,14 @@ and treat the input prompt as the prefix in `generate`. 
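Aside (not part of the patch): since this change narrows prefix-LM support to the GPT-style classes, a rough usage sketch of the remaining path may help. The `bidirectional_mask` keyword follows this module's convention for marking prefix tokens, but treat the exact call shape as an assumption rather than something taken from the diff.

```python
# Rough sketch: convert a supported GPT-style causal LM and pass a prefix mask
# at forward time. The bidirectional_mask keyword is assumed from this module's
# convention for marking prefix positions.
import torch
from transformers import AutoTokenizer, GPT2LMHeadModel

from llmfoundry.models.utils.hf_prefixlm_converter import \
    convert_hf_causal_lm_to_prefix_lm

tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = convert_hf_causal_lm_to_prefix_lm(
    GPT2LMHeadModel.from_pretrained('gpt2'))

batch = tokenizer('Translate to French: cheese', return_tensors='pt')
# 1 where tokens belong to the prefix (attended bidirectionally), 0 elsewhere.
bidirectional_mask = torch.ones_like(batch['input_ids'])

outputs = model(input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                bidirectional_mask=bidirectional_mask)
```

At generation time the wrapper handles this automatically, treating the whole input prompt as the prefix, as the module docstring above notes.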
""" -import math -import warnings from types import MethodType from typing import Any, List, MutableMapping, Optional, Tuple, Union import torch -from transformers.models.bloom.modeling_bloom import ( - BaseModelOutputWithPastAndCrossAttentions, BloomForCausalLM, BloomModel, - CausalLMOutputWithCrossAttentions, CrossEntropyLoss) -from transformers.models.bloom.modeling_bloom import \ - _expand_mask as _expand_mask_bloom -from transformers.models.bloom.modeling_bloom import \ - _make_causal_mask as _make_causal_mask_bloom -from transformers.models.bloom.modeling_bloom import logging from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel from transformers.models.gpt_neo.modeling_gpt_neo import GPTNeoForCausalLM from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXForCausalLM from transformers.models.gptj.modeling_gptj import GPTJForCausalLM -from transformers.models.opt.modeling_opt import OPTForCausalLM -from transformers.models.opt.modeling_opt import \ - _expand_mask as _expand_mask_opt -from transformers.models.opt.modeling_opt import \ - _make_causal_mask as _make_causal_mask_opt - -logger = logging.get_logger(__name__) _SUPPORTED_GPT_MODELS = ( GPT2LMHeadModel, @@ -223,583 +206,10 @@ def generate(self: CAUSAL_GPT_TYPES, *args: Any, **kwargs: Any): return model -def _convert_bloom_causal_lm_to_prefix_lm( - model: BloomForCausalLM) -> BloomForCausalLM: - """Converts a BLOOM Causal LM to a Prefix LM. - - Supported HuggingFace model classes: - - `BloomForCausalLM` - - See `convert_hf_causal_lm_to_prefix_lm` for more details. - """ - if hasattr(model, '_prefix_lm_converted'): - return model - - assert isinstance(model, BloomForCausalLM) - assert model.config.add_cross_attention == False, 'Only supports BLOOM decoder-only models' - - # Modified from transformers.models.bloom.modeling_bloom.BloomModel._prepare_attn_mask - # https://github.com/huggingface/transformers/blob/v4.25.1/src/transformers/models/bloom/modeling_bloom.py#L648 - def _prepare_attn_mask( - self: BloomModel, - attention_mask: torch.Tensor, - bidirectional_mask: Optional[torch.Tensor], - input_shape: Tuple[int, int], - past_key_values_length: int, - ) -> torch.BoolTensor: - # create causal mask - # [batch_size, seq_length] -> [batch_size, 1, tgt_length, src_length] - combined_attention_mask = None - device = attention_mask.device - _, src_length = input_shape - - if src_length > 1: - combined_attention_mask = _make_causal_mask_bloom( - input_shape, - device=device, - past_key_values_length=past_key_values_length) - # Make use of the batch-specific `bidirectional_mask` attribute set - # by the parent module in its (new) `forward` method wrapper - if bidirectional_mask is not None: - # The two masks should have the same size - assert attention_mask.shape == bidirectional_mask.shape - - # [batch_size, seq_length] -> [batch_size, 1, tgt_length, src_length] - expanded_bidirectional_mask = _expand_mask_bloom( - bidirectional_mask, tgt_length=src_length) - combined_attention_mask = torch.logical_and( - combined_attention_mask, expanded_bidirectional_mask) - - # [batch_size, seq_length] -> [batch_size, 1, tgt_length, src_length] - expanded_attn_mask = _expand_mask_bloom(attention_mask, - tgt_length=src_length) - combined_attention_mask = (expanded_attn_mask - if combined_attention_mask is None else - expanded_attn_mask | combined_attention_mask) - - return combined_attention_mask - - # Modified from transformers.models.bloom.modeling_bloom._prepare_alibi_transformer - # 
https://github.com/huggingface/transformers/blob/v4.25.1/src/transformers/models/bloom/modeling_bloom.py#L87 - def _build_alibi_tensor( - self: BloomModel, - batch_size: int, - query_length: int, - key_length: int, - dtype: torch.dtype, - device: torch.device, - ) -> torch.Tensor: - num_heads = self.config.n_head - - closest_power_of_2 = 2**math.floor(math.log2(num_heads)) - base = torch.tensor(2**(-(2**-(math.log2(closest_power_of_2) - 3))), - device=device, - dtype=torch.float32) - powers = torch.arange(1, - 1 + closest_power_of_2, - device=device, - dtype=torch.int32) - slopes = torch.pow(base, powers) - - if closest_power_of_2 != num_heads: - extra_base = torch.tensor( - 2**(-(2**-(math.log2(2 * closest_power_of_2) - 3))), - device=device, - dtype=torch.float32) - num_remaining_heads = min(closest_power_of_2, - num_heads - closest_power_of_2) - extra_powers = torch.arange(1, - 1 + 2 * num_remaining_heads, - 2, - device=device, - dtype=torch.int32) - slopes = torch.cat( - [slopes, torch.pow(extra_base, extra_powers)], dim=0) - - qa = torch.arange(query_length, device=device, - dtype=torch.int32).view(-1, 1) - ka = torch.arange(key_length, device=device, - dtype=torch.int32).view(1, -1) - diffs = qa - ka + key_length - query_length - diffs = -diffs.abs() - alibi = slopes.view(1, num_heads, 1, 1) * diffs.view( - 1, 1, query_length, key_length) - alibi = alibi.expand(batch_size, -1, -1, - -1).reshape(-1, query_length, key_length) - return alibi.to(dtype) - - # Modified from transformers.models.bloom.modeling_bloom.BloomModel.forward - # Note: The modified code is surrounded with #### START/END #### comments - # and one new argument (`bidirectional_mask`) is added to the signature. - KeyValueT = Tuple[torch.Tensor, torch.Tensor] - - def transformer_forward( - self: BloomModel, - input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[KeyValueT, ...]] = None, - attention_mask: Optional[torch.Tensor] = None, - bidirectional_mask: Optional[torch.Tensor] = None, - head_mask: Optional[torch.LongTensor] = None, - inputs_embeds: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - **deprecated_arguments: Any - ) -> Union[Tuple[torch.Tensor, ...], - BaseModelOutputWithPastAndCrossAttentions]: - if deprecated_arguments.pop('position_ids', False) is not False: - # `position_ids` could have been `torch.Tensor` or `None` so - # defaulting pop to `False` allows to detect if users were - # passing explicitly `None` - warnings.warn( - '`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. 
' +\ - 'You can safely ignore passing `position_ids`.', - FutureWarning, - ) - if len(deprecated_arguments) > 0: - raise ValueError( - f'Got unexpected arguments: {deprecated_arguments}') - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if input_ids is not None and inputs_embeds is not None: - raise ValueError( - 'You cannot specify both input_ids and inputs_embeds at the same time' - ) - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError( - 'You have to specify either input_ids or inputs_embeds') - - if past_key_values is None: - past_key_values = tuple([None] * len(self.h)) # type: ignore - - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape batch_size x num_heads x N x N - # head_mask has shape n_layer x batch x num_heads x N x N - head_mask = self.get_head_mask(head_mask, self.config.n_layer) - - if inputs_embeds is None: - inputs_embeds = self.word_embeddings(input_ids) - - hidden_states = self.word_embeddings_layernorm(inputs_embeds) - - presents = () if use_cache else None - all_self_attentions = () if output_attentions else None - all_hidden_states = () if output_hidden_states else None - - # Compute alibi tensor: check build_alibi_tensor documentation - seq_length_with_past = seq_length - past_key_values_length = 0 - if past_key_values[0] is not None: # type: ignore - tmp = past_key_values[0][0] # type: ignore - past_key_values_length = tmp.shape[2] # type: ignore - seq_length_with_past = seq_length_with_past + past_key_values_length - if attention_mask is None: - attention_mask = torch.ones((batch_size, seq_length_with_past), - device=hidden_states.device) - else: - attention_mask = attention_mask.to(hidden_states.device) - - ##### ALL NON-SIGNATURE MODIFICATIONS ARE CONTAINED TO THIS BLOCK [STARTS HERE] ##### - alibi = self._build_alibi_tensor( - batch_size=batch_size, - query_length=seq_length, - key_length=seq_length_with_past, - dtype=hidden_states.dtype, - device=hidden_states.device, - ) - - causal_mask = self._prepare_attn_mask( - attention_mask, - bidirectional_mask, - input_shape=(batch_size, seq_length), - past_key_values_length=past_key_values_length, - ) - ##### ALL NON-SIGNATURE MODIFICATIONS ARE CONTAINED TO THIS BLOCK [ENDS HERE] ##### - - for i, (block, - layer_past) in enumerate(zip(self.h, - past_key_values)): # type: ignore - - if output_hidden_states: - hst = (hidden_states,) - all_hidden_states = all_hidden_states + hst # type: ignore - - if self.gradient_checkpointing and self.training: - - if use_cache: - logger.warning( - '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...' 
- ) - use_cache = False - - def create_custom_forward(module: torch.nn.Module): - - def custom_forward(*inputs: Any): - # None for past_key_value - return module(*inputs, - use_cache=use_cache, - output_attentions=output_attentions) - - return custom_forward - - outputs = torch.utils.checkpoint.checkpoint( # type: ignore - create_custom_forward(block), - hidden_states, - alibi, - causal_mask, - head_mask[i], # type: ignore - ) - else: - outputs = block( - hidden_states, - layer_past=layer_past, - attention_mask=causal_mask, - head_mask=head_mask[i], # type: ignore - use_cache=use_cache, - output_attentions=output_attentions, - alibi=alibi, - ) - - hidden_states = outputs[0] - if use_cache is True: - presents = presents + (outputs[1],) # type: ignore - - if output_attentions: - oa = (outputs[2 if use_cache else 1],) # type: ignore - all_self_attentions = all_self_attentions + oa # type: ignore - - # Add last hidden state - hidden_states = self.ln_f(hidden_states) - - if output_hidden_states: - hst = (hidden_states,) - all_hidden_states = all_hidden_states + hst # type: ignore - - if not return_dict: - return tuple(v for v in [ - hidden_states, presents, all_hidden_states, all_self_attentions - ] if v is not None) - - return BaseModelOutputWithPastAndCrossAttentions( - last_hidden_state=hidden_states, - past_key_values=presents, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - ) - - # Make it so model.transformer has the new helper methods and new - # `forward` method - setattr(model.transformer, '_prepare_attn_mask', - MethodType(_prepare_attn_mask, model.transformer)) - setattr(model.transformer, '_build_alibi_tensor', - MethodType(_build_alibi_tensor, model.transformer)) - setattr(model.transformer, 'forward', - MethodType(transformer_forward, model.transformer)) - - # In order to actually use the new argument we've added to - # model.transformer, we need to update the parent module's `forward` to - # accept/pass the same new argument. - # We add 2 lines to handle that change. - # Both lines are tagged with "# WE'RE ADDING A NEW ARGUMENT!" - KeyValueT = Tuple[torch.Tensor, torch.Tensor] - - def forward( - self: BloomForCausalLM, - input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[KeyValueT, ...]] = None, - attention_mask: Optional[torch.Tensor] = None, - # WE'RE ADDING A NEW ARGUMENT! (Change 1/2) - bidirectional_mask: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - **deprecated_arguments: Any, - ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: - """Replacement forward method for BloomCausalLM.""" - if deprecated_arguments.pop('position_ids', False) is not False: - # `position_ids` could have been `torch.Tensor` or `None` so - # defaulting pop to `False` allows to detect if users were passing - # explicitly `None` - warnings.warn( - '`position_ids` have no functionality in BLOOM and will be removed ' +\ - 'in v5.0.0. 
You can safely ignore passing `position_ids`.', - FutureWarning, - ) - if len(deprecated_arguments) > 0: - raise ValueError( - f'Got unexpected arguments: {deprecated_arguments}') - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.transformer( - input_ids, - past_key_values=past_key_values, - attention_mask=attention_mask, - # WE'RE ADDING A NEW ARGUMENT! (Change 2/2) - bidirectional_mask=bidirectional_mask, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - - lm_logits = self.lm_head(hidden_states) - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = lm_logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - batch_size, seq_length, vocab_size = shift_logits.shape - # Flatten the tokens - loss_fct = CrossEntropyLoss() - loss = loss_fct( - shift_logits.view(batch_size * seq_length, vocab_size), - shift_labels.view(batch_size * seq_length)) - - if not return_dict: - output = (lm_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return CausalLMOutputWithCrossAttentions( - loss=loss, - logits=lm_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) - - # To handle generation, re-write `prepare_inputs_for_generation` to - # implement the bidirectional logic. - def prepare_inputs_for_generation(self: BloomForCausalLM, - input_ids: torch.LongTensor, - past: Optional[torch.Tensor] = None, - attention_mask: Optional[ - torch.Tensor] = None, - **kwargs: Any) -> dict: - del kwargs # unused - # only last token for input_ids if past is not None - if past: - input_ids = input_ids[:, -1].unsqueeze(-1) # type: ignore - # We can turn off bidirectional masking after the prefix - # has been encoded into `past` - bidirectional_mask = None - - # the cache may be in the standard format (e.g. in contrastive - # search), convert to bloom's format if needed - if past[0][0].shape[0] == input_ids.shape[0]: - past = self._convert_to_bloom_cache(past) - - else: - # If we're here, `input_ids` contains the prefix. Encode it with - # bidirectional attention. - bidirectional_mask = torch.ones_like(input_ids) - - return { - 'input_ids': input_ids, - 'past_key_values': past, - # "use_cache": kwargs.get("use_cache"), - # Requires this. TODO(Alex): Confirm this supports other decoding strategies. - 'use_cache': True, - 'attention_mask': attention_mask, - 'bidirectional_mask': bidirectional_mask, - } - - # Register the new `forward` and `prepare_inputs_for_generation` methods - # with the model - setattr(model, 'forward', MethodType(forward, model)) - setattr(model, 'prepare_inputs_for_generation', - MethodType(prepare_inputs_for_generation, model)) - - # Finally, tag the model so that this conversion cannot happen again. - setattr(model, '_prefix_lm_converted', True) - return model - - -def _convert_opt_causal_lm_to_prefix_lm( - model: OPTForCausalLM) -> OPTForCausalLM: - """Converts an OPT Causal LM to a Prefix LM. - - Supported HuggingFace model classes: - - `OPTForCausalLM` - - See `convert_hf_causal_lm_to_prefix_lm` for more details. 
- """ - if hasattr(model, '_prefix_lm_converted'): - return model - - assert isinstance(model, OPTForCausalLM) - assert model.config.add_cross_attention == False, 'Only supports OPT decoder-only models' - - # Rename methods to allow: - # - new `forward` to wrap original `forward` - # - new `generate` to wrap original `generate` - setattr(model, '_original_forward', getattr(model, 'forward')) - setattr(model, '_original_generate', getattr(model, 'generate')) - - model.model.decoder.bidirectional_mask = None - - # Modified from transformers.models.bloom.modeling_opt.OPTDecoder._prepare_decoder_attn_mask - # https://github.com/huggingface/transformers/blob/v4.25.1/src/transformers/models/opt/modeling_opt.py#L532 - def _prepare_decoder_attention_mask(self: torch.nn.Module, - attention_mask: Optional[torch.Tensor], - input_shape: Tuple[int, int], - inputs_embeds: Optional[torch.Tensor], - past_key_values_length: int): - # create causal mask - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - combined_attention_mask = None - if input_shape[-1] > 1: - assert inputs_embeds is not None - # 'g' indicates generation mode. Causal mask replaced with 0. - if self.bidirectional_mask == 'g': - bsz, src_length = input_shape - combined_attention_mask = torch.zeros( - (bsz, 1, src_length, src_length + past_key_values_length), - dtype=inputs_embeds.dtype, - device=inputs_embeds.device) - else: - combined_attention_mask = _make_causal_mask_opt( - input_shape, - inputs_embeds.dtype, - past_key_values_length=past_key_values_length).to( - inputs_embeds.device) - - # Make use of the batch-specific `bidirectional_mask` attribute - # set by the parent module in its (new) `forward` method wrapper - if self.bidirectional_mask is not None: - assert attention_mask is not None - # The two masks should have the same size - assert attention_mask.shape == self.bidirectional_mask.shape - - # [batch_size, seq_length] -> [batch_size, 1, tgt_length, src_length] - expanded_bidirectional_mask = _expand_mask_opt( - self.bidirectional_mask, - inputs_embeds.dtype, - tgt_len=input_shape[-1]).to(inputs_embeds.device) - combined_attention_mask = torch.maximum( - expanded_bidirectional_mask, combined_attention_mask) - - if attention_mask is not None: - assert inputs_embeds is not None - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - expanded_attn_mask = _expand_mask_opt(attention_mask, - inputs_embeds.dtype, - tgt_len=input_shape[-1]).to( - inputs_embeds.device) - combined_attention_mask = (expanded_attn_mask - if combined_attention_mask is None else - expanded_attn_mask + - combined_attention_mask) - - return combined_attention_mask - - # Make it so model.model.decoder uses the above `_prepare_decoder_attn_mask` - # in place of the original method - setattr(model.model.decoder, '_prepare_decoder_attention_mask', - MethodType(_prepare_decoder_attention_mask, model.model.decoder)) - - def forward( - self: OPTForCausalLM, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.Tensor] = None, - bidirectional_mask: Optional[torch.ByteTensor] = None, - head_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ): - - def call_og_forward(): - return self._original_forward( - input_ids=input_ids, - 
attention_mask=attention_mask, - head_mask=head_mask, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - labels=labels, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - if bidirectional_mask is None: - # This wrapper is a no-op if bidirectional masks are not supplied - return call_og_forward() - - # Temporarily set `bidirectional_mask` in the child module - self.model.decoder.bidirectional_mask = bidirectional_mask - - # Apply the original forward method (the model will use the mask that - # was just set) - try: - outputs = call_og_forward() - except: - self.model.decoder.bidirectional_mask = None - raise - - # Reset the `bidirectional_mask` attribute to None - self.model.decoder.bidirectional_mask = None - - # Return the outputs - return outputs - - def generate(self: OPTForCausalLM, *args: tuple, **kwargs: Any): - """Wraps original generate to enable PrefixLM-style attention.""" - # Flag the child module to use generation-style attention masking - self.model.decoder.bidirectional_mask = 'g' - - # Collect outputs using the model's original forward method - try: - output = self._original_generate(*args, **kwargs) - except: - self.model.decoder.bidirectional_mask = None - raise - - # Reset the `bidirectional_mask` attribute to None - self.model.decoder.bidirectional_mask = None - - # Return the output - return output - - # Replace `forward` and `generate` with the new wrappers - setattr(model, 'forward', MethodType(forward, model)) - setattr(model, 'generate', MethodType(generate, model)) - - # Finally, tag the model so that this conversion cannot happen again. - setattr(model, '_prefix_lm_converted', True) - return model - - -_SUPPORTED_HF_MODELS = _SUPPORTED_GPT_MODELS + (BloomForCausalLM, - OPTForCausalLM) +_SUPPORTED_HF_MODELS = _SUPPORTED_GPT_MODELS CAUSAL_LM_TYPES = Union[GPT2LMHeadModel, GPTJForCausalLM, GPTNeoForCausalLM, - GPTNeoXForCausalLM, BloomForCausalLM, OPTForCausalLM] + GPTNeoXForCausalLM] def convert_hf_causal_lm_to_prefix_lm( @@ -811,8 +221,6 @@ def convert_hf_causal_lm_to_prefix_lm( - `GPTNeoForCausalLM` - `GPTNeoXForCausalLM` - `GPTJForCausalLM` - - `BloomForCausalLM` - - `OPTForCausalLM` Conversion to a Prefix LM is done by modifying the `forward` method, and possibly also the `generate` method and/or select underlying methods depending on the model class. @@ -862,13 +270,6 @@ def convert_hf_causal_lm_to_prefix_lm( """ if isinstance(model, _SUPPORTED_GPT_MODELS): return _convert_gpt_causal_lm_to_prefix_lm(model) - - elif isinstance(model, BloomForCausalLM): - return _convert_bloom_causal_lm_to_prefix_lm(model) - - elif isinstance(model, OPTForCausalLM): - return _convert_opt_causal_lm_to_prefix_lm(model) - else: raise TypeError( f'Cannot convert model to Prefix LM. 
' +\ diff --git a/tests/test_model.py b/tests/test_model.py index 67166bef68..1c7033ed48 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -25,8 +25,7 @@ from transformers.modeling_outputs import CausalLMOutputWithPast from transformers.models.bloom.modeling_bloom import build_alibi_tensor -from llmfoundry import (COMPOSER_MODEL_REGISTRY, ComposerHFCausalLM, - ComposerHFPrefixLM) +from llmfoundry import COMPOSER_MODEL_REGISTRY, ComposerHFCausalLM from llmfoundry.models.hf.model_wrapper import HuggingFaceModelWithZLoss from llmfoundry.models.layers import NORM_CLASS_REGISTRY, build_alibi_bias from llmfoundry.models.layers.blocks import MPTBlock @@ -438,11 +437,10 @@ def test_loss_fn(): atol=1e-4), f'differed at step {i}' -@pytest.mark.parametrize('prefixlm', [False, True]) -def test_opt_wrapping(prefixlm: bool): +def test_opt_wrapping(): conf = { 'model': { - 'name': 'hf_prefix_lm' if prefixlm else 'hf_causal_lm', + 'name': 'hf_causal_lm', 'pretrained_model_name_or_path': 'facebook/opt-125m', 'pretrained': 'false' }, @@ -456,10 +454,7 @@ def test_opt_wrapping(prefixlm: bool): tokenizer = build_tokenizer(config.tokenizer.name, tokenizer_cfg.get('kwargs', {})) - if prefixlm: - model = ComposerHFPrefixLM(config.model, tokenizer) - else: - model = ComposerHFCausalLM(config.model, tokenizer) + model = ComposerHFCausalLM(config.model, tokenizer) # check that all the modules we except are blocked from FSDP wrapping assert not model.model.model._fsdp_wrap From e40689f434a5bfa1ef5c261483fb77819324e0b9 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Mon, 30 Oct 2023 15:12:00 -0700 Subject: [PATCH 11/49] Fix attention patch compatibility for llama2 (#705) --- .../layers/llama_attention_monkeypatch.py | 4 ++ tests/test_huggingface_flash.py | 50 +++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/llmfoundry/models/layers/llama_attention_monkeypatch.py b/llmfoundry/models/layers/llama_attention_monkeypatch.py index 88f61e3fef..9ceeb0747e 100644 --- a/llmfoundry/models/layers/llama_attention_monkeypatch.py +++ b/llmfoundry/models/layers/llama_attention_monkeypatch.py @@ -78,6 +78,8 @@ def llama_attention_patch_torch( past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: bool = False, use_cache: bool = False, + # Temporary fix for llama2 transformers compatibility, padding_mask will be deprecated in the next transformers release after 4.34.1. + padding_mask: Optional[torch.LongTensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: if use_cache: raise NotImplementedError( @@ -186,6 +188,8 @@ def llama_attention_patch_triton( past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: bool = False, use_cache: bool = False, + # Temporary fix for llama2 transformers compatibility, padding_mask will be deprecated in the next transformers release after 4.34.1. 
+ padding_mask: Optional[torch.LongTensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: if use_cache: raise NotImplementedError( diff --git a/tests/test_huggingface_flash.py b/tests/test_huggingface_flash.py index a71217ea1f..834488bb6a 100644 --- a/tests/test_huggingface_flash.py +++ b/tests/test_huggingface_flash.py @@ -10,6 +10,7 @@ import transformers from composer.core.precision import get_precision_context from composer.utils import reproducibility +from omegaconf import DictConfig from omegaconf import OmegaConf as om from llmfoundry import COMPOSER_MODEL_REGISTRY @@ -107,6 +108,55 @@ def test_patch_equivalence(patch_fn_name: str, explicit_mask: bool, assert torch.allclose(attn_output, new_output, atol=atol, rtol=rtol) +@pytest.mark.gpu +@pytest.mark.parametrize('patch', ['triton', 'torch']) +def test_attn_patch_integration(patch: str): + if 'HUGGING_FACE_HUB_TOKEN' not in os.environ: + pytest.skip( + 'The CI cluster does not have access to the Llama models, so skip this test.' + ) + + # Save the original attention function to restore at the end of the test. + from transformers.models.llama.modeling_llama import LlamaAttention + original_attn = LlamaAttention.forward + + name = 'meta-llama/Llama-2-7b-hf' + model_cfg = DictConfig({ + 'name': 'hf_causal_lm', + 'pretrained_model_name_or_path': name, + 'config_overrides': { + 'num_hidden_layers': 2, + 'intermediate_size': 64, + }, + 'use_auth_token': True, + 'pretrained': False, + 'init_device': 'cpu', + 'attention_patch_type': patch + }) + + tokenizer = build_tokenizer(name, tokenizer_kwargs={}) + tokenizer.pad_token = tokenizer.eos_token + + model = COMPOSER_MODEL_REGISTRY[model_cfg['name']](model_cfg, tokenizer) + + tokenized_input = tokenizer(['Hello world blah blah', 'Goodbye world'], + return_tensors='pt', + padding=True) + tokenized_input['labels'] = tokenized_input['input_ids'].clone() + + tokenized_input = {k: v.cuda() for k, v in tokenized_input.items()} + model.to('cuda') + + with get_precision_context('amp_bf16'): + # We're just testing that the attention patch runs okay + outputs = model(tokenized_input) + loss = outputs.loss + loss.backward() + + # Ensure the patch does not persist beyond this test. 
+ LlamaAttention.forward = original_attn + + @pytest.mark.gpu @pytest.mark.parametrize('model_name', ['llama2', 'mistral']) @pytest.mark.parametrize('use_flash_attention_2', [True, False]) From 08f377bc3fac8f95894c8cc5b527d26c1e860bef Mon Sep 17 00:00:00 2001 From: dblalock Date: Mon, 30 Oct 2023 23:09:00 -0700 Subject: [PATCH 12/49] add test coverage for lion and lion8b checkpoint interop (#679) Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- tests/test_lion8b.py | 55 ++++++++++++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 15 deletions(-) diff --git a/tests/test_lion8b.py b/tests/test_lion8b.py index ddb70e882b..0c7010ce9f 100644 --- a/tests/test_lion8b.py +++ b/tests/test_lion8b.py @@ -24,6 +24,7 @@ LocalOptimStateDictConfig = MagicMock() ShardedOptimStateDictConfig = MagicMock() +from llmfoundry.optim import DecoupledLionW from llmfoundry.optim import DecoupledLionW_8bit as Lion8bit warnings.filterwarnings('ignore') @@ -406,8 +407,12 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # type:ignore @pytest.mark.parametrize('use_errors', [False, True]) @pytest.mark.parametrize('state_sharding', [_FULL_STATE, _SHARDED_STATE, _LOCAL_STATE]) +@pytest.mark.parametrize('save_as_lion8b, load_as_lion8b', [(False, True), + (True, False), + (True, True)]) def test_fsdp_save_load(dtype: torch.dtype, use_errors: bool, - state_sharding: fsdp.StateDictType): + state_sharding: fsdp.StateDictType, + save_as_lion8b: bool, load_as_lion8b: bool): device = 'cuda' if torch.cuda.device_count() < 2: pytest.skip(f'This test requires 2+ GPUs.') @@ -419,6 +424,10 @@ def test_fsdp_save_load(dtype: torch.dtype, use_errors: bool, dist.init_process_group(backend='nccl') assert dist.get_world_size() >= 2, 'Misconfigured test run!' + # nb: this is the line that causes: + # `Warning: Deallocating Tensor that still has live PyObject references.` + # suggesting this warning isn't an issue with our test code. 
It's also + # going to stdout (probably from cpp) so we can't suppress it with warnings mod = FSDP(_DummyModule(device=device, dtype=dtype)) # actual forward pass instead of setting p.grad to avoid FSDP issues @@ -429,7 +438,10 @@ def test_fsdp_save_load(dtype: torch.dtype, use_errors: bool, p.grad = torch.rand_like(p) # create optimizer and have it step so that state gets populated - opt = Lion8bit(mod.parameters(), error_correction=use_errors) + if save_as_lion8b: + opt = Lion8bit(mod.parameters(), error_correction=use_errors) + else: + opt = DecoupledLionW(mod.parameters()) opt.step() opt.zero_grad() @@ -449,13 +461,22 @@ def _set_state_dict_type(model: nn.Module): FSDP.set_state_dict_type(model, state_sharding, state_dict_cfg, optim_cfg) + def _local_shard(t: torch.Tensor) -> torch.Tensor: + try: # can't operate on ShardedTensors directly + return t.local_tensor() # type: ignore + except AttributeError: + return t + # load FSDP state dict _set_state_dict_type(mod) opt_state_dict = FSDP.optim_state_dict(mod, opt) # make a new model and optimizer mod_new = FSDP(_DummyModule(device=device, dtype=dtype)) - opt_new = Lion8bit(mod_new.parameters(), error_correction=use_errors) + if load_as_lion8b: + opt_new = Lion8bit(mod_new.parameters(), error_correction=use_errors) + else: + opt_new = DecoupledLionW(mod_new.parameters()) _set_state_dict_type(mod_new) # load state dict into the new optimizer @@ -480,22 +501,26 @@ def _set_state_dict_type(model: nn.Module): mom_new = d_new['exp_avg'] assert mom_orig.shape == mom_new.shape - assert mom_orig.dtype == mom_new.dtype - if use_errors and (dtype != torch.float32): - errs_orig = d_orig['errors'] - errs_new = d_new['errors'] - assert errs_orig.shape == errs_new.shape - assert errs_orig.dtype == errs_new.dtype - - if state_sharding != _FULL_STATE: - continue # more detailed checks lean on FSDP impl details + both_lion8b = save_as_lion8b and load_as_lion8b + check_errors = both_lion8b and use_errors and (dtype != torch.float32) + if both_lion8b: + assert mom_orig.dtype == mom_new.dtype + if check_errors: + errs_orig = d_orig['errors'] + errs_new = d_new['errors'] + assert errs_orig.shape == errs_new.shape + assert errs_orig.dtype == errs_new.dtype # momentums may not be bit-for-bit identical because Optimizer upcasts # to f32 and we convert back to bf16, possibly with different rounding - torch.testing.assert_close(mom_orig, mom_new) + torch.testing.assert_close(_local_shard(mom_orig).float(), + _local_shard(mom_new).float(), + atol=1e-4, + rtol=1. 
/ 128) # errors not bit-for-bit identical because scales get upcast too - if use_errors and (dtype != torch.float32): - torch.testing.assert_close(d_orig['errors'], d_new['errors']) + if check_errors: + torch.testing.assert_close(_local_shard(d_orig['errors']), + _local_shard(d_new['errors'])) @pytest.mark.gpu From 3eb9717c4b5121e11a18ce5b00ce18d92532cac5 Mon Sep 17 00:00:00 2001 From: S A G A R <110724849+tmsagarofficial@users.noreply.github.com> Date: Tue, 31 Oct 2023 13:23:53 +0530 Subject: [PATCH 13/49] Improvement in README.md and TUTORIAL.md (#699) * Update README.md * Update TUTORIAL.md --------- Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- README.md | 4 ++-- TUTORIAL.md | 24 ++++++++++++------------ 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 1d3f6d5df4..46074613e1 100644 --- a/README.md +++ b/README.md @@ -181,14 +181,14 @@ source llmfoundry-venv-amd/bin/activate # installs pip install cmake packaging torch -pip install -e . # this installs some things which are not needed but they dont hurt +pip install -e . # This installs some things that are not needed but they don't hurt pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.4.2 ``` **Lastly**, install the ROCm enabled flash attention (instructions [here](https://github.com/ROCmSoftwarePlatform/flash-attention/tree/flash_attention_for_rocm2#amd-gpurocm-support)). Notes: 1. `attn_impl: triton` does not work. -1. We don't yet have a docker img where everything works perfectly. You might need to up/down grade some packages (in our case, we needed to downgrade to `numpy==1.23.5`) before everything works without issue. +1. We don't yet have a docker img where everything works perfectly. You might need to up/downgrade some packages (in our case, we needed to downgrade to `numpy==1.23.5`) before everything works without issue. # Quickstart diff --git a/TUTORIAL.md b/TUTORIAL.md index 36993bc409..d019eb9f83 100644 --- a/TUTORIAL.md +++ b/TUTORIAL.md @@ -68,7 +68,7 @@ The Trainer is a pytorch-native object that composes your model, dataset(s), opt Spending some time understanding the Composer Trainer is a great way to form a deeper understanding of what the train and eval scripts are doing under the hood. Composer also comes packaged with the `composer` launcher. -If you go through our docs, you'll notice that we instruct you to launch the train script (`scripts/train/train.py`) and eval script (`scripts/eval/eval.py`) using the launcher, like so, +If you go through our docs, you'll notice that we instruct you to launch the training script (`scripts/train/train.py`) and eval script (`scripts/eval/eval.py`) using the launcher, like so, ```bash @@ -81,7 +81,7 @@ The `composer` launcher puts all your GPUs to work by launching the script on a ### StreamingDataset The training script contains logic for building a few different types of dataloaders used for different training tasks. -Each of these dataloaders are built to work with **streaming datasets**. +Each of these dataloaders is built to work with **streaming datasets**. There are a number of benefits that come from using streaming datasets, from fast, deterministic resumption to easily loading from a mixture of streams at once. The scripts in `scripts/data_prep/` are your one-stop-shop for converting a local dataset or a dataset on the Hugging Face Hub to our streaming MDS format. @@ -178,7 +178,7 @@ We address two possible versions of “finetuning” here. 
For both, you’ll wa ### Supervised FineTuning and Instruction FineTuning -`scripts/train/` already includes some resources for supervised finetuning. If that’s what you’re interestested in check out +`scripts/train/` already includes some resources for supervised finetuning. If that’s what you’re interested in check out 1. [**LLM Finetuning from a Local Dataset: A Concrete Example**](https://github.com/mosaicml/llm-foundry/blob/main/scripts/train/finetune_example/README.md) 2. [The YAML which should replicate the process of creating MPT-7B-Instruct from MPT-7b](https://github.com/mosaicml/llm-foundry/blob/main/scripts/train/yamls/finetune/mpt-7b_dolly_sft.yaml) — You can point this at your own dataset by [following these instructions](https://github.com/mosaicml/llm-foundry/blob/main/scripts/train/README.md#Usage) @@ -228,7 +228,7 @@ After you're done training, you probably want to convert your Composer checkpoin > **Note** > Pretraining for 10s of billions of tokens is a large job even for a smaller model; you’ll want multiple A100s for this example. -It is conceivable that you would like to train a model *with the same architecture* as a model available in HuggingFace `transformers` but without using those same weights; for example, if you have a large amount of proprietary data, or want to change something about the model that is hard to change after the fact. So, as an example, let’s say you want a version of `gpt2` but with longer sequence length, say 2048. Using the MPT architecture would give us Flash Attention and ALiBi, allowing us to go much longer; but for this example we stick with 2048. And of course, let’s use 150 tokens/parameter, which is the ratio that MPT-7B used, getting us to 17.55B tokens for our 117M param model. +It is conceivable that you would like to train a model *with the same architecture* as a model available in HuggingFace `transformers` but without using those same weights; for example, if you have a large amount of proprietary data, or want to change something about the model that is hard to change after the fact. So, as an example, let’s say you want a version of `gpt2` but with a longer sequence length, say 2048. Using the MPT architecture would give us Flash Attention and ALiBi, allowing us to go much longer; but for this example we stick with 2048. And of course, let’s use 150 tokens/parameter, which is the ratio that MPT-7B used, getting us to 17.55B tokens for our 117M param model. The first step to training from scratch is to get your pretraining data prepared. Following [the data preparation README](https://github.com/mosaicml/llm-foundry/blob/main/scripts/data_prep/README.md), we convert C4 as follows: @@ -294,25 +294,25 @@ The purpose of this section is probably pretty self-evident. You’ve got questi - **Long answer:** In NLP, Softmax Attention operates on a sequence. It is an all to all graph operation where, during training, the memory complexity is quadratic with respect to the length of the sequence. Furthermore, on GPUs, naive implementations of Softmax Attention are bandwidth (BW) limited. [Rabe et al. (2021)](https://arxiv.org/abs/2112.05682) and [Dao et al. (2022)](https://arxiv.org/abs/2205.14135) showed that fusing all operations in Softmax Attention can make the operation much less BW limited. -Furthermore, integrating a recompuation schema decreases the sequence length memory complexity from *quadratic* to *linear*, thereby supporting much longer sequence lengths. 
+Furthermore, integrating a recomputation schema decreases the sequence length memory complexity from *quadratic* to *linear*, thereby supporting much longer sequence lengths. - Setting `attn_config.attn_impl=torch` enables a naive Softmax Attention written using base torch operations. - Setting `attn_config.attn_impl=flash` enables Flash Attention [implemented by Dao et al in the HazyResearch repo using CUDA](https://github.com/HazyResearch/flash-attention). This will have linear memory complexity (enabling larger batch sizes) and will run much faster. - - Setting `attn_config.attn_impl=triton` enables a Flash Attention [implemented using Triton](https://github.com/mosaicml/llm-foundry/blob/main/llmfoundry/models/layers/flash_attn_triton.py). In our experiance, `triton` is slightly faster than `flash`. + - Setting `attn_config.attn_impl=triton` enables a Flash Attention [implemented using Triton](https://github.com/mosaicml/llm-foundry/blob/main/llmfoundry/models/layers/flash_attn_triton.py). In our experience, `triton` is slightly faster than `flash`. - #### Limitations - For training, `torch` uses a lot of memory and is slow. -- `flash` and `triton` cannot return attention weights and therefore cannot be used with methods which require it. -- `flash` cannot accept an attention bias and therefore cannot be used with methods which require it such as ALiBi. +- `flash` and `triton` cannot return attention weights and therefore cannot be used with methods that require it. +- `flash` cannot accept an attention bias and therefore cannot be used with methods that require it such as ALiBi. #### What is `triton-pre-mlir`? - Torch2 installs and requires a specific version of [Triton](https://openai.com/research/triton). @@ -370,7 +370,7 @@ model: ``` enables [TransformerEngine's LayerNormMLP](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/api/pytorch.html#transformer_engine.pytorch.LayerNormMLP) layer which enables sequence parallelism if configured correctly. -WARNING: `state_dicts` generated with `ffn_type: te_ln_mlp` will NOT directly map to `state_dicts` generated using the default network configurations. We do not have control over how `te.LayerNormMLP` is implemented and therefore cannot reasily reconcile it with the default implementation (or any other implementation). +WARNING: `state_dicts` generated with `ffn_type: te_ln_mlp` will NOT directly map to `state_dicts` generated using the default network configurations. We do not have control over how `te.LayerNormMLP` is implemented and therefore cannot readily reconcile it with the default implementation (or any other implementation). ### How expensive is it to build LLMs? - Check out our blog post [GPT3-Quality for <$500k](https://www.mosaicml.com/blog/gpt-3-quality-for-500k) for guidance on LLM training times and costs. From 51a174fea460f5322553b3ea302c7eb54f889f76 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Tue, 31 Oct 2023 08:09:37 -0700 Subject: [PATCH 14/49] Make TiktokenTokenizerWrapper picklable (#700) --- llmfoundry/tokenizers/tiktoken.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/llmfoundry/tokenizers/tiktoken.py b/llmfoundry/tokenizers/tiktoken.py index 10a296497a..650d469ecf 100644 --- a/llmfoundry/tokenizers/tiktoken.py +++ b/llmfoundry/tokenizers/tiktoken.py @@ -50,6 +50,23 @@ def __init__(self, raise ImportError( 'You need to install tiktoken to use TiktokenTokenizerWrapper.') + # Workaround to make tiktokenizer picklable. 
+ # https://github.com/huggingface/datasets/issues/5536#issuecomment-1682309347 + # There is an open PR from HF to add this to tiktoken: https://github.com/openai/tiktoken/pull/181 + import copyreg + import functools + + from tiktoken import Encoding # type: ignore (thirdParty) + + def pickle_Encoding(enc: Encoding): + return (functools.partial(Encoding, + enc.name, + pat_str=enc._pat_str, + mergeable_ranks=enc._mergeable_ranks, + special_tokens=enc._special_tokens), ()) + + copyreg.pickle(Encoding, pickle_Encoding) + if model_name is not None and encoding_name is not None: raise ValueError( 'You need to specify either model_name or encoding_name, not both.' From ac8e023d4534611a9845a2c993bdc91af7b56fbd Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Wed, 1 Nov 2023 10:29:53 -0700 Subject: [PATCH 15/49] Add num_proc to map and filter calls (#706) --- llmfoundry/data/finetuning/tasks.py | 8 +++++++- tests/test_hf_conversion_script.py | 3 ++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index f2bd0239c8..edbfcc28c7 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -339,14 +339,20 @@ def dataset_mapper(example: Dict): example = preprocessing_fn(example) return _tokenize_formatted_example(example, tokenizer) + detected_cpu_count = os.cpu_count() or 1 + num_cpus_to_use = max(1, detected_cpu_count - 4) + columns_to_remove = list(dataset[0].keys()) tokenized_dataset = dataset.map( dataset_mapper, batched=False, remove_columns=columns_to_remove, + num_proc=num_cpus_to_use, ) prompt_length_filtered_dataset = tokenized_dataset.filter( - lambda example: len(example['input_ids']) < max_seq_len) + lambda example: len(example['input_ids']) < max_seq_len, + num_proc=num_cpus_to_use, + ) examples_removed = len(tokenized_dataset) - len( prompt_length_filtered_dataset) diff --git a/tests/test_hf_conversion_script.py b/tests/test_hf_conversion_script.py index d2f203d3a0..d2c2a9e1c9 100644 --- a/tests/test_hf_conversion_script.py +++ b/tests/test_hf_conversion_script.py @@ -5,7 +5,7 @@ import os import pathlib import sys -from unittest.mock import MagicMock +from unittest.mock import MagicMock, patch from composer import Trainer from composer.loggers import MLFlowLogger @@ -254,6 +254,7 @@ def test_callback_inits_with_defaults(): @pytest.mark.parametrize( 'hf_save_interval,save_interval,max_duration,expected_hf_checkpoints,expected_normal_checkpoints', [('3ba', '2ba', '7ba', 3, 4), ('1dur', '2ba', '1ep', 1, 4)]) +@patch('os.cpu_count', MagicMock(return_value=None)) def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path, fsdp_state_dict_type: Optional[str], log_to_mlflow: bool, From 6c412412da951b22bfaabb1d0f0333bc4b31a6d0 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Thu, 2 Nov 2023 15:38:50 -0700 Subject: [PATCH 16/49] Fix HF local module copy contention with a meta init on local rank 0 (#710) --- llmfoundry/models/hf/hf_causal_lm.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py index eb90b07045..d52633a09b 100644 --- a/llmfoundry/models/hf/hf_causal_lm.py +++ b/llmfoundry/models/hf/hf_causal_lm.py @@ -5,6 +5,7 @@ import logging import os +import warnings from typing import Mapping, Union # required for loading a python model into composer @@ -157,6 +158,24 @@ def 
__init__(self, om_model_config: Union[DictConfig, if dist.get_local_rank() != 0 and init_device == 'mixed': om_model_config.pretrained = False + # If the HuggingFace model is coming from a local folder, Hugging Face copies the modules into the + # transformers modules cache. On particular systems, this operation seems to cause contention between + # the different processes. To avoid this contention, we first create the model (on meta device) on local rank + # zero. This will set up the transformers model cache and avoid the future contention. + if dist.get_local_rank() == 0 and os.path.isdir( + om_model_config.pretrained_model_name_or_path): + with init_empty_weights(include_buffers=False): + with warnings.catch_warnings(): + warnings.simplefilter('ignore', UserWarning) + AutoModelForCausalLM.from_pretrained( + om_model_config.pretrained_model_name_or_path, + trust_remote_code=trust_remote_code, + use_auth_token=use_auth_token, + config=config, + ) + + dist.barrier() + # initialize the model on the correct device if resolved_init_device == 'cpu': if om_model_config.pretrained: From ca8e6b5cbb5da78d688ca1862e69f4dc948d866f Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Sat, 4 Nov 2023 19:40:53 -0700 Subject: [PATCH 17/49] Add support for auto packing ratio (#683) --- llmfoundry/data/__init__.py | 2 + llmfoundry/data/dataloader.py | 44 +++ llmfoundry/data/denoising.py | 16 +- llmfoundry/data/finetuning/dataloader.py | 50 ++-- llmfoundry/data/packing.py | 277 ++++++++++++------ mcli/mcli-llama2-finetune.yaml | 5 +- scripts/misc/profile_packing.py | 100 +++++++ .../mpt-7b-arc-easy--gpu.yaml | 5 +- scripts/train/train.py | 29 +- .../yamls/finetune/1b_local_data_sft.yaml | 5 +- .../train/yamls/finetune/7b_dolly_sft.yaml | 5 +- .../yamls/finetune/mpt-7b_dolly_sft.yaml | 5 +- tests/test_dataloader.py | 7 +- tests/test_packing.py | 191 ++++++++++++ 14 files changed, 587 insertions(+), 154 deletions(-) create mode 100644 llmfoundry/data/dataloader.py create mode 100644 scripts/misc/profile_packing.py create mode 100644 tests/test_packing.py diff --git a/llmfoundry/data/__init__.py b/llmfoundry/data/__init__.py index c997c865dd..8da436b9b1 100644 --- a/llmfoundry/data/__init__.py +++ b/llmfoundry/data/__init__.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 from llmfoundry.data.data import ConcatTokensDataset, NoConcatDataset +from llmfoundry.data.dataloader import build_dataloader from llmfoundry.data.denoising import (MixtureOfDenoisersCollator, build_text_denoising_dataloader) from llmfoundry.data.finetuning import (Seq2SeqFinetuningCollator, @@ -18,4 +19,5 @@ 'build_text_dataloader', 'NoConcatDataset', 'ConcatTokensDataset', + 'build_dataloader', ] diff --git a/llmfoundry/data/dataloader.py b/llmfoundry/data/dataloader.py new file mode 100644 index 0000000000..12741717be --- /dev/null +++ b/llmfoundry/data/dataloader.py @@ -0,0 +1,44 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +"""Dataloader builder utilities.""" + +from composer import DataSpec +from omegaconf import DictConfig +from transformers import PreTrainedTokenizerBase + +from llmfoundry.data.denoising import build_text_denoising_dataloader +from llmfoundry.data.finetuning.dataloader import build_finetuning_dataloader +from llmfoundry.data.text_data import build_text_dataloader + + +def build_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, + device_batch_size: int) -> DataSpec: + """Builds a dataloader from a config. 
+ + Args: + cfg (DictConfig): An omegaconf dictionary used to configure the loader. + tokenizer (PreTrainedTokenizerBase): The tokenizer that the model will use. + device_batch_size (int): The size of the batches (number of examples) + that the dataloader will produce. + """ + if cfg.name == 'text': + return build_text_dataloader( + cfg, + tokenizer, + device_batch_size, + ) + elif cfg.name == 'text_denoising': + return build_text_denoising_dataloader( + cfg, + tokenizer, + device_batch_size, + ) + elif cfg.name == 'finetuning': + return build_finetuning_dataloader( + cfg, + tokenizer, + device_batch_size, + ) + else: + raise ValueError(f'Not sure how to build dataloader with config: {cfg}') diff --git a/llmfoundry/data/denoising.py b/llmfoundry/data/denoising.py index bc41945076..7d497b4efd 100644 --- a/llmfoundry/data/denoising.py +++ b/llmfoundry/data/denoising.py @@ -16,7 +16,7 @@ from torch.utils.data import DataLoader from transformers import PreTrainedTokenizerBase -from llmfoundry.data.packing import BinPackWrapper +from llmfoundry.data.packing import BinPackCollator from llmfoundry.data.text_data import (StreamingTextDataset, get_tokens_per_batch_func) from llmfoundry.models import utils @@ -375,19 +375,25 @@ def build_text_denoising_dataloader( cfg.dataset.max_seq_len (int): The maximum length of sequences in the batch. See :class:`MixtureOfDenoisersCollator` docstring for details. - cfg.dataset.packing_ratio (float, optional): If provided, this invokes + cfg.dataset.packing_ratio (Optional[float, Literal['auto']]): If provided, this invokes a collator wrapper that packs device_batch_size*packing_ratio raw examples into device_batch_size packed examples. This helps minimize padding while preserving sequence integrity. This adds `sequence_id` to the batch, which indicates which unique sequence each token belongs to. + + If set to 'auto', packing_ratio is profiled and the highest observed packing ratio with + zero waste is selected. + In practice, this may result in > 0 waste because profiling is done on only a portion + of the dataset. + Note: Using this feature will not change device_batch_size but it will determine the number of raw examples consumed by the dataloader per batch. Some examples may be discarded if they do not fit when packing. Select packing_ratio **carefully** based on the dataset statistics, max_seq_len, and tolerance for discarding samples! - The packing code in `./packing.py` provides a script that can help + The script `scripts/misc/profile_packing.py` can help you choose the best packing_ratio. See :class:`StreamingTextDataset` for info on other standard config options within `cfg.dataset`. @@ -419,7 +425,7 @@ def build_text_denoising_dataloader( that the dataloader will produce. Note: - You can run the script inside `./packing.py` to quickly test the + You can use the script `scripts/misc/profile_packing.py` to quickly test the padding/waste rates for different `cfg.dataset.packing_ratio` choices, given a starting workload YAML. """ @@ -492,7 +498,7 @@ def build_text_denoising_dataloader( raise NotImplementedError( 'On-the-fly packing is currently only supported for decoder-only formats.' 
) - collate_fn = BinPackWrapper( + collate_fn = BinPackCollator( collator=collate_fn, target_batch_size=device_batch_size, max_seq_len=cfg.dataset.max_seq_len, diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 2dde563ac6..6e988ac149 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -14,7 +14,7 @@ from llmfoundry.data.finetuning.collator import Seq2SeqFinetuningCollator from llmfoundry.data.finetuning.tasks import dataset_constructor -from llmfoundry.data.packing import BinPackWrapper +from llmfoundry.data.packing import BinPackCollator, auto_packing_ratio from llmfoundry.data.text_data import get_tokens_per_batch_func log = logging.getLogger(__name__) @@ -74,20 +74,26 @@ def build_finetuning_dataloader(cfg: DictConfig, cfg.dataset.allow_pad_trimming (bool, optional): Whether to allow the collator to trim padding. See :class:`Seq2SeqFinetuningCollator` docstring for details. Default: ``False``. - cfg.dataset.packing_ratio (float, optional): If provided, this invokes - a collator wrapper that packs `device_batch_size*packing_ratio` - raw examples into `device_batch_size` packed examples. This helps + cfg.dataset.packing_ratio (Optional[float, Literal['auto']]): If provided, this invokes + a collator wrapper that packs device_batch_size*packing_ratio + raw examples into device_batch_size packed examples. This helps minimize padding while preserving sequence integrity. This adds `sequence_id` to the batch, which indicates which unique sequence each token belongs to. + + If set to 'auto', packing_ratio is profiled and the highest observed packing ratio with + zero waste is selected. + In practice, this may result in > 0 waste because profiling is done on only a portion + of the dataset. + Note: Using this feature will not change device_batch_size but it will determine the number of raw examples consumed by the dataloader per batch. Some examples may be discarded if they do not fit when packing. - Select `packing_ratio` **carefully** based on the dataset - statistics, `max_seq_len`, and tolerance for discarding samples! - The packing code in `../packing.py` provides a script that can help - you choose the best `packing_ratio`. + Select packing_ratio **carefully** based on the dataset + statistics, max_seq_len, and tolerance for discarding samples! + The script `scripts/misc/profile_packing.py` can help + you choose the best packing_ratio. cfg.dataset.shuffle (bool): Whether to shuffle the dataset. ___ See :class:`StreamingFinetuningDataset` for info on other standard config @@ -106,7 +112,7 @@ def build_finetuning_dataloader(cfg: DictConfig, A pytorch dataloader Note: - You can run the script inside `../packing.py` to quickly test the + You can run the script inside `scripts/misc/profile_packing.py` to quickly test the padding/waste rates for different `cfg.dataset.packing_ratio` choices, given a starting workload YAML. 
""" @@ -143,7 +149,7 @@ def build_finetuning_dataloader(cfg: DictConfig, ) collate_fn, dataloader_batch_size = _build_collate_fn( - cfg.dataset, tokenizer, device_batch_size) + cfg, tokenizer, device_batch_size) dl = DataLoader( dataset, @@ -174,7 +180,7 @@ def build_finetuning_dataloader(cfg: DictConfig, ) collate_fn, dataloader_batch_size = _build_collate_fn( - cfg.dataset, tokenizer, device_batch_size) + cfg, tokenizer, device_batch_size) if cfg.drop_last: world_size = dist.get_world_size() @@ -367,25 +373,33 @@ def _build_hf_dataset_from_remote( def _build_collate_fn( - dataset_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, + dataloader_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, device_batch_size: int -) -> Tuple[Union[Seq2SeqFinetuningCollator, BinPackWrapper], int]: +) -> Tuple[Union[Seq2SeqFinetuningCollator, BinPackCollator], int]: + dataset_cfg = dataloader_cfg.dataset + max_seq_len = dataset_cfg.max_seq_len + collate_fn = Seq2SeqFinetuningCollator( tokenizer=tokenizer, - max_seq_len=dataset_cfg.max_seq_len, + max_seq_len=max_seq_len, decoder_only_format=dataset_cfg.decoder_only_format, allow_pad_trimming=dataset_cfg.get('allow_pad_trimming', False), ) packing_ratio = dataset_cfg.get('packing_ratio') + max_leftover_bins_to_keep = dataset_cfg.get('max_leftover_bins_to_keep') if packing_ratio is None: - if dataset_cfg.get('max_leftover_bins_to_keep') is not None: + if max_leftover_bins_to_keep is not None: raise ValueError( 'dataset.max_leftover_bins_to_keep has been defined, ' +\ 'but dataset.packing_ratio has not been set. Please set ' +\ 'the latter to turn on packing or remove the former from the config.') return collate_fn, device_batch_size + if packing_ratio == 'auto': + packing_ratio = auto_packing_ratio(dataloader_cfg, tokenizer, + device_batch_size) + if packing_ratio == 1.0: return collate_fn, device_batch_size elif packing_ratio < 1.0: @@ -396,13 +410,13 @@ def _build_collate_fn( 'On-the-fly packing is currently only supported for decoder-only formats.' 
) - collate_fn = BinPackWrapper( + collate_fn = BinPackCollator( collator=collate_fn, target_batch_size=device_batch_size, - max_seq_len=dataset_cfg.max_seq_len, + max_seq_len=max_seq_len, pad_token_id=tokenizer.pad_token_id, padding_side=tokenizer.padding_side, - max_leftover_bins_to_keep=dataset_cfg.get('max_leftover_bins_to_keep'), + max_leftover_bins_to_keep=max_leftover_bins_to_keep, ) n_examples_to_pack = int(device_batch_size * packing_ratio) return collate_fn, n_examples_to_pack diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index 1532de276e..1ae9efcce5 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -1,8 +1,7 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 -import os -from typing import Any, Callable, Dict, List, Literal, Optional, Tuple +from typing import Callable, Dict, Iterable, List, Literal, Optional, Tuple import numpy as np import torch @@ -10,7 +9,7 @@ from transformers import PreTrainedTokenizerBase -class BinPackWrapper: +class BinPackCollator: """Utility collator for packing to reduce padding.""" def __init__(self, @@ -33,13 +32,10 @@ def __init__(self, if self.pad_token_id < 0: raise ValueError(f'{pad_token_id=} must be >=0.') - if max_leftover_bins_to_keep is None: - self.max_leftover_bins_to_keep = int(10 * self.out_size) - elif max_leftover_bins_to_keep < 0: + if max_leftover_bins_to_keep is not None and max_leftover_bins_to_keep < 0: raise ValueError( f'{max_leftover_bins_to_keep=} must be >=0 or None.') - else: - self.max_leftover_bins_to_keep = int(max_leftover_bins_to_keep) + self.max_leftover_bins_to_keep = max_leftover_bins_to_keep self.n_packed_tokens = 0 self.n_total_tokens = 0 @@ -60,7 +56,9 @@ def __call__( self, examples: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]: batch = self.base_collator(examples) + return self.pack(batch) + def pack(self, batch: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: assert 'attention_mask' in batch assert 'input_ids' in batch @@ -75,12 +73,12 @@ def __call__( # Cut everything down to size sizes, trimmed_examples = [], [] for idx in range(batch['attention_mask'].shape[0]): - size, trimmed_example = extract_trim_batch_idx(batch, idx) + size, trimmed_example = _extract_trim_batch_idx(batch, idx) sizes.append(size) trimmed_examples.append(trimmed_example) # Apply our CS 101 bin packing algorithm. 
- packed_examples, n_packed_tokens, n_total_tokens, leftover_bins = first_fit_bin_packing( + packed_examples, n_packed_tokens, n_total_tokens, leftover_bins = _first_fit_bin_packing( sizes=sizes, examples=trimmed_examples, num_bins=self.out_size, @@ -93,15 +91,15 @@ def __call__( self._leftover_bins = leftover_bins[:self.max_leftover_bins_to_keep] # Re-pad to max_seq_len and batch - batch = repad(packed_examples, - max_seq_len=self.max_seq_len, - pad_token_id=self.pad_token_id, - padding_side=self.padding_side) + batch = _repad(packed_examples, + max_seq_len=self.max_seq_len, + pad_token_id=self.pad_token_id, + padding_side=self.padding_side) return batch -def extract_trim_batch_idx(batch: Dict[str, torch.Tensor], - idx: int) -> Tuple[int, Dict[str, torch.Tensor]]: +def _extract_trim_batch_idx(batch: Dict[str, torch.Tensor], + idx: int) -> Tuple[int, Dict[str, torch.Tensor]]: example = {k: v[idx] for k, v in batch.items()} keep = example['attention_mask'] == 1 @@ -112,7 +110,7 @@ def extract_trim_batch_idx(batch: Dict[str, torch.Tensor], return size, trim_example -def combine_in_place( +def _combine_in_place( example: Dict[str, torch.Tensor], add_on: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: if 'labels' in add_on: @@ -129,7 +127,7 @@ def combine_in_place( return example -def first_fit_bin_packing( +def _first_fit_bin_packing( sizes: List[int], examples: List[Dict[str, torch.Tensor]], num_bins: int, max_bin_size: int, existing_bins: List[Tuple[int, Dict[str, torch.Tensor]]] ) -> Tuple[List[Dict[str, torch.Tensor]], int, int, List[Tuple[int, Dict[ @@ -194,7 +192,7 @@ def first_fit_bin_packing( if bins[bidx][0] + size <= max_bin_size: bin_size, packed_example = bins.pop(bidx) bin_size = bin_size + size - packed_example = combine_in_place(packed_example, example) + packed_example = _combine_in_place(packed_example, example) bins.append((bin_size, packed_example)) added = True break @@ -225,8 +223,8 @@ def first_fit_bin_packing( bin_sizes[:num_bins]), sum(sizes), sorted_bins[num_bins:] -def repad(packed_examples: List[Dict[str, torch.Tensor]], max_seq_len: int, - pad_token_id: int, padding_side: str) -> Dict[str, torch.Tensor]: +def _repad(packed_examples: List[Dict[str, torch.Tensor]], max_seq_len: int, + pad_token_id: int, padding_side: str) -> Dict[str, torch.Tensor]: def pad_tensor(tensor: torch.Tensor, pad_value: int): if len(tensor) == max_seq_len: @@ -260,14 +258,168 @@ def pad_tensor(tensor: torch.Tensor, pad_value: int): return batch +def auto_packing_ratio(dataloader_cfg: DictConfig, + tokenizer: PreTrainedTokenizerBase, + device_batch_size: int, + num_packing_ratios: int = 20) -> float: + """Find a packing ratio that minimizes padding with zero waste. + + By packing examples, we can increase training efficiency, training on more data with less batches. + However, in practice, the selected packing_ratio may produce some waste because profiling is done on only + a subset of the dataset. + + We select a min_ratio of 1 and a max_ratio that is the max_seq_len / 100, and profile up to + num_packing_ratios packing ratios between min_ratio and max_ratio, inclusive. + When a packing_ratio with non-zero waste is found, we stop and select the previous ratio, + which has zero waste. + + Args: + dataloader_cfg (DictConfig): The dataloader configuration for profiling. + tokenizer (PreTrainedTokenizerBase): The tokenizer for profiling. + device_batch_size (int): The size of the batches (number of examples) per device. + num_packing_ratio (int): The number of packing ratios to try. 
+ + Returns: + A packing ratio that minimizes padding while maintaining zero waste. + """ + from composer.utils import dist, get_device, reproducibility + + # Stash the rng state to restore later. + rng_state = reproducibility.get_rng_state() + # Set the seed so that auto packing is deterministic. + reproducibility.seed_all(0) + + min_ratio = 1 + max_ratio = dataloader_cfg.dataset.max_seq_len / 100 + profiling_results = profile_packing(dataloader_cfg, tokenizer, min_ratio, + max_ratio, num_packing_ratios, + device_batch_size) + + # Obtain the maximum packing_ratio/minimum padding that has no waste. + # profiling_results are sorted from smallest to largest packing_ratio. + packing_ratio = 1 + for packing_ratio_candidate, _, waste in profiling_results: + if waste > 0: + break + packing_ratio = packing_ratio_candidate + + # Select the minimum packing ratio across all ranks. + if dist.is_available() and dist.is_initialized(): + device = get_device(None) + packing_ratio_tensor = device.tensor_to_device( + torch.tensor(packing_ratio)) + dist.all_reduce(packing_ratio_tensor, reduce_operation='MIN') + packing_ratio = packing_ratio_tensor.item() + + # Restore rng state. + reproducibility.load_rng_state(rng_state) + + return packing_ratio + + +def profile_packing( + dataloader_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, + min_ratio: float, max_ratio: float, num_packing_ratios: int, + device_batch_size: int) -> Iterable[Tuple[float, float, float]]: + """Generator function that profiles example packing across packing ratios. + + Args: + dataloader_cfg (DictConfig): The dataloader configuration for profiling. + tokenizer (PreTrainedTokenizerBase): The tokenizer for profiling. + min_ratio (float): Smallest packing_ratio to test. Must be >=1. + max_ratio (float): Largest packing_ratio to test. Must be larger than `min_ratio`. + num_packing_ratios (int): Number of packing_ratio values (spaced between `min_ratio` and `max_ratio`) to try. + device_batch_size (int): The size of the batches (number of examples) per device. + + Returns: + An iterable of tuples of packing ratio, padding, and waste, sorted by smallest to largest packing ratio. 
+ """ + import copy + + from llmfoundry.data.dataloader import build_dataloader + + max_seq_len = dataloader_cfg.dataset.get('max_seq_len') + max_leftovers_to_keep = dataloader_cfg.dataset.get('max_leftovers_to_keep', + None) + + # Turn off packing for the dataloader (we want raw, pre-packed examples) + dataloader_cfg = copy.deepcopy(dataloader_cfg) + dataloader_cfg.dataset.packing_ratio = None + dataloader_cfg.drop_last = False + dataloader_cfg.num_workers = 0 + dataloader_cfg.prefetch_factor = None + + # Determine the packing_ratio values we'll try + packing_ratios, raw_batch_sizes = [], [] + for packing_ratio in np.linspace(min_ratio, + max_ratio, + num_packing_ratios, + endpoint=True): + packing_ratio = np.round(10 * packing_ratio) / 10 + raw_batch_size = int(packing_ratio * device_batch_size) + if raw_batch_size not in raw_batch_sizes: + packing_ratios.append(packing_ratio) + raw_batch_sizes.append(raw_batch_size) + + n_profile_examples = max(raw_batch_sizes) * 100 + + train_dataspec = build_dataloader(dataloader_cfg, tokenizer, + n_profile_examples) + train_dataloader = train_dataspec.dataloader + + # Get a bunch of raw examples + big_batch = next(iter(train_dataloader)) + + def split_big_batch(raw_batch_size: int) -> List: + input_ids = big_batch['input_ids'].split(raw_batch_size) + batches = [{'input_ids': x} for x in input_ids] + + for key in big_batch.keys(): + if key == 'input_ids': + continue + for idx, split in enumerate(big_batch[key].split(raw_batch_size)): + batches[idx].update({key: split}) + return batches + + def profile(raw_batch_size: int) -> Tuple[float, float]: + packer = BinPackCollator( + collator=lambda x: x, + target_batch_size=device_batch_size, + max_seq_len=max_seq_len, + pad_token_id=0, # <-- Doesn't need to be correct for profiling + padding_side='left', # <-- Doesn't need to be correct for profiling + max_leftover_bins_to_keep=max_leftovers_to_keep) + + # Simulate feeding the packing collator a bunch of data + for batch in split_big_batch(raw_batch_size): + if batch['input_ids'].shape[0] < device_batch_size: + continue + _ = packer.pack(batch) + + # Return the padding / waste stats over that bunch of data + padding_percent = 100 * (1 - packer.efficiency) + waste_percent = 100 * packer.waste + return padding_percent, waste_percent + + for packing_ratio, raw_batch_size in zip(packing_ratios, raw_batch_sizes): + padding, waste = profile(raw_batch_size) + yield (packing_ratio, padding, waste) + + if __name__ == '__main__': + + import warnings + + warnings.warn( + DeprecationWarning( + 'Please use scripts/misc/profile_packing.py to profile packing.' + + 'This script will be removed in later releases.')) + + import os from argparse import ArgumentParser, Namespace from omegaconf import OmegaConf as om - from llmfoundry import (build_finetuning_dataloader, - build_text_denoising_dataloader) - from llmfoundry.data import build_text_dataloader from llmfoundry.utils import build_tokenizer def parse_args() -> Namespace: @@ -296,7 +448,7 @@ def parse_args() -> Namespace: parser.add_argument( '--num-packing-ratios', type=int, - default=10, + default=20, help= 'Number of packing_ratio values (spaced between `min` and `max) to try.' 
) @@ -316,20 +468,6 @@ def parse_args() -> Namespace: raise ValueError('`num_packing_ratios` must be a positive integer.') return args - def build_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, - device_batch_size: int): - if cfg.name == 'text': - return build_text_dataloader(cfg, tokenizer, device_batch_size) - elif cfg.name == 'text_denoising': - return build_text_denoising_dataloader(cfg, tokenizer, - device_batch_size) - elif cfg.name == 'finetuning': - return build_finetuning_dataloader(cfg, tokenizer, - device_batch_size) - else: - raise ValueError( - f'Not sure how to build dataloader with config: {cfg}') - args = parse_args() with open(args.yaml_path) as f: @@ -339,26 +477,11 @@ def build_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, cfg = om.create(cfg) device_batch_size = cfg.global_train_batch_size // args.num_devices - # Determine the packing_ratio values we'll try - packing_ratios, raw_batch_sizes = [], [] - for packing_ratio in np.linspace(args.min, - args.max, - args.num_packing_ratios, - endpoint=True): - packing_ratio = np.round(10 * packing_ratio) / 10 - raw_batch_size = int(packing_ratio * device_batch_size) - if raw_batch_size not in raw_batch_sizes: - packing_ratios.append(packing_ratio) - raw_batch_sizes.append(raw_batch_size) - # Fetch a bunch of raw examples once, which we'll re-use if 'train_loader' not in cfg: raise ValueError('config must define train_loader') dataloader_cfg = cfg.train_loader - max_leftovers_to_keep = dataloader_cfg.dataset.get('max_leftovers_to_keep', - None) - # build tokenizer if 'tokenizer' not in cfg: raise ValueError('config must define tokenizer') @@ -367,57 +490,19 @@ def build_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, if not isinstance(resolved_tokenizer_cfg, Dict): raise ValueError( 'tokenizer config needs to be resolved by omegaconf into a Dict.') - tokenizer_cfg: Dict[Any, Any] = resolved_tokenizer_cfg + tokenizer_cfg = resolved_tokenizer_cfg tokenizer_name = tokenizer_cfg['name'] tokenizer_kwargs = tokenizer_cfg.get('kwargs', {}) tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) - # Turn off packing for the dataloader (we want raw, pre-packed examples) - dataloader_cfg.dataset.packing_ratio = None - dataloader_cfg.dataset.max_leftovers_to_keep = None - train_dataloader = build_dataloader(dataloader_cfg, tokenizer, - max(raw_batch_sizes) * 100).dataloader - - # Get a bunch of raw examples - big_batch = next(iter(train_dataloader)) - - def split_big_batch(raw_batch_size: int) -> List: - input_ids = big_batch['input_ids'].split(raw_batch_size) - batches = [{'input_ids': x} for x in input_ids] - - for key in big_batch.keys(): - if key == 'input_ids': - continue - for idx, split in enumerate(big_batch[key].split(raw_batch_size)): - batches[idx].update({key: split}) - return batches - - def profile_packing(raw_batch_size: int) -> Tuple[float, float]: - packer = BinPackWrapper( - collator=lambda x: x, - target_batch_size=device_batch_size, - max_seq_len=dataloader_cfg.dataset.max_seq_len, - pad_token_id=0, # <-- Doesn't need to be correct for profiling - padding_side='left', # <-- Doesn't need to be correct for profiling - max_leftover_bins_to_keep=max_leftovers_to_keep) - - # Simulate feeding the packing collator a bunch of data - for batch in split_big_batch(raw_batch_size): - if batch['input_ids'].shape[0] < device_batch_size: - continue - _ = packer(batch) - - # Return the padding / waste stats over that bunch of data - padding_percent = 100 * (1 - packer.efficiency) 
- waste_percent = 100 * packer.waste - return padding_percent, waste_percent + results = profile_packing(dataloader_cfg, tokenizer, args.min, args.max, + args.num_packing_ratios, device_batch_size) header = '\n\n\n packing_ratio | % PADDING | % WASTE' fstr = ' {:5.1f} | {:5.2f}% | {:6.2f}%' print(header) print('-' * len(header)) - for packing_ratio, raw_batch_size in zip(packing_ratios, raw_batch_sizes): - padding, waste = profile_packing(raw_batch_size) + for packing_ratio, padding, waste in results: print(fstr.format(packing_ratio, padding, waste)) diff --git a/mcli/mcli-llama2-finetune.yaml b/mcli/mcli-llama2-finetune.yaml index ae8f57abb6..93d46f57e3 100644 --- a/mcli/mcli-llama2-finetune.yaml +++ b/mcli/mcli-llama2-finetune.yaml @@ -56,7 +56,10 @@ parameters: allow_pad_trimming: false decoder_only_format: true shuffle: true - # # Use `python llmfoundry/data/packing.py --yaml-path /path/to/this/yaml/ ...` + # # Use packing_ratio: 'auto' to automatically profile and select the highest observed packing ratio with + # # zero waste. In practice, this may result in > 0 waste because profiling is done on only a portion + # # of the dataset. + # # Or use `python llmfoundry/scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...` # # to profile this run's optimal packing_ratio as it depends on GPU count, # # batch size, sequence length # packing_ratio: diff --git a/scripts/misc/profile_packing.py b/scripts/misc/profile_packing.py new file mode 100644 index 0000000000..51841d669e --- /dev/null +++ b/scripts/misc/profile_packing.py @@ -0,0 +1,100 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +"""Script to profile example packing.""" +import os +from typing import Dict + +from llmfoundry.data.packing import profile_packing + +if __name__ == '__main__': + from argparse import ArgumentParser, Namespace + + from omegaconf import OmegaConf as om + + from llmfoundry.utils import build_tokenizer + + def parse_args() -> Namespace: + """Parse commandline arguments.""" + parser = ArgumentParser( + description= + 'Profile packing_ratio choices for a particular workload.') + parser.add_argument( + '--yaml-path', + type=str, + required=True, + help='Path to the YAML that defines the workload to profile.') + parser.add_argument('--num-devices', + type=int, + default=None, + help='How many devices your run will use.') + parser.add_argument('--min', + type=float, + required=True, + help='Smallest packing_ratio to test. Must be >=1.') + parser.add_argument( + '--max', + type=float, + required=True, + help='Largest packing_ratio to test. Must be larger than `min`.') + parser.add_argument( + '--num-packing-ratios', + type=int, + default=20, + help= + 'Number of packing_ratio values (spaced between `min` and `max) to try.' 
+ ) + + args = parser.parse_args() + + if not os.path.isfile(args.yaml_path): + raise FileNotFoundError( + '`yaml_path` does not correspond to any existing file.') + if args.num_devices < 1: + raise ValueError('`num_devices` must be a positive integer.') + if args.min < 1.0: + raise ValueError('`min` must be >=1.0.') + if args.max < args.min: + raise ValueError('`max` cannot be less than `min`.') + if args.num_packing_ratios < 1: + raise ValueError('`num_packing_ratios` must be a positive integer.') + return args + + args = parse_args() + + with open(args.yaml_path) as f: + cfg = om.load(f) + if 'parameters' in cfg: + cfg = om.to_container(cfg.parameters) + cfg = om.create(cfg) + device_batch_size = cfg.global_train_batch_size // args.num_devices + + # Fetch a bunch of raw examples once, which we'll re-use + if 'train_loader' not in cfg: + raise ValueError('config must define train_loader') + dataloader_cfg = cfg.train_loader + + # build tokenizer + if 'tokenizer' not in cfg: + raise ValueError('config must define tokenizer') + + resolved_tokenizer_cfg = om.to_container(cfg.tokenizer, resolve=True) + if not isinstance(resolved_tokenizer_cfg, Dict): + raise ValueError( + 'tokenizer config needs to be resolved by omegaconf into a Dict.') + tokenizer_cfg = resolved_tokenizer_cfg + + tokenizer_name = tokenizer_cfg['name'] + tokenizer_kwargs = tokenizer_cfg.get('kwargs', {}) + tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) + + results = profile_packing(dataloader_cfg, tokenizer, args.min, args.max, + args.num_packing_ratios, device_batch_size) + + header = '\n\n\n packing_ratio | % PADDING | % WASTE' + fstr = ' {:5.1f} | {:5.2f}% | {:6.2f}%' + + print(header) + print('-' * len(header)) + for packing_ratio, padding, waste in results: + print(fstr.format(packing_ratio, padding, waste)) diff --git a/scripts/train/finetune_example/mpt-7b-arc-easy--gpu.yaml b/scripts/train/finetune_example/mpt-7b-arc-easy--gpu.yaml index 2c3fb11496..ed2e9fcac0 100644 --- a/scripts/train/finetune_example/mpt-7b-arc-easy--gpu.yaml +++ b/scripts/train/finetune_example/mpt-7b-arc-easy--gpu.yaml @@ -41,7 +41,10 @@ train_loader: shuffle: true max_seq_len: ${max_seq_len} decoder_only_format: true - # # Use `python llmfoundry/data/packing.py --yaml-path /path/to/this/yaml/ ...` + # # Use packing_ratio: 'auto' to automatically profile and select the highest observed packing ratio with + # # zero waste. In practice, this may result in > 0 waste because profiling is done on only a portion + # # of the dataset. 
+ # # Or use `python llmfoundry/scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...` # # to profile this run's optimal packing_ratio as it depends on GPU count, # # batch size, sequence length # packing_ratio: diff --git a/scripts/train/train.py b/scripts/train/train.py index e29f2c9a47..60ee55955e 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -24,9 +24,8 @@ from transformers import PreTrainedTokenizerBase from llmfoundry import (COMPOSER_MODEL_REGISTRY, ComposerHFCausalLM, - MPTForCausalLM, build_finetuning_dataloader, - build_text_denoising_dataloader) -from llmfoundry.data.text_data import build_text_dataloader + MPTForCausalLM) +from llmfoundry.data.dataloader import build_dataloader from llmfoundry.utils.builders import (build_algorithm, build_callback, build_icl_data_and_gauntlet, build_logger, build_optimizer, @@ -169,30 +168,6 @@ def print_trainable_parameters(model: torch.nn.Module) -> None: ) -def build_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, - device_batch_size: int): - if cfg.name == 'text': - return build_text_dataloader( - cfg, - tokenizer, - device_batch_size, - ) - elif cfg.name == 'text_denoising': - return build_text_denoising_dataloader( - cfg, - tokenizer, - device_batch_size, - ) - elif cfg.name == 'finetuning': - return build_finetuning_dataloader( - cfg, - tokenizer, - device_batch_size, - ) - else: - raise ValueError(f'Not sure how to build dataloader with config: {cfg}') - - def main(cfg: DictConfig) -> Trainer: # Filter deprecation warning from torch internal usage warnings.filterwarnings( diff --git a/scripts/train/yamls/finetune/1b_local_data_sft.yaml b/scripts/train/yamls/finetune/1b_local_data_sft.yaml index 45dca2f1e0..d6f72b0c8e 100644 --- a/scripts/train/yamls/finetune/1b_local_data_sft.yaml +++ b/scripts/train/yamls/finetune/1b_local_data_sft.yaml @@ -49,7 +49,10 @@ train_loader: &train_loader allow_pad_trimming: false decoder_only_format: true shuffle: true - # # Use `python llmfoundry/data/packing.py --yaml-path /path/to/this/yaml/ ...` + # # Use packing_ratio: 'auto' to automatically profile and select the highest observed packing ratio with + # # zero waste. In practice, this may result in > 0 waste because profiling is done on only a portion + # # of the dataset. + # # Or use `python llmfoundry/scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...` # # to profile this run's optimal packing_ratio as it depends on GPU count, # # batch size, sequence length # packing_ratio: diff --git a/scripts/train/yamls/finetune/7b_dolly_sft.yaml b/scripts/train/yamls/finetune/7b_dolly_sft.yaml index 6483dd31f5..c5813235d9 100644 --- a/scripts/train/yamls/finetune/7b_dolly_sft.yaml +++ b/scripts/train/yamls/finetune/7b_dolly_sft.yaml @@ -41,7 +41,10 @@ train_loader: allow_pad_trimming: false decoder_only_format: true shuffle: true - # # Use `python llmfoundry/data/packing.py --yaml-path /path/to/this/yaml/ ...` + # # Use packing_ratio: 'auto' to automatically profile and select the highest observed packing ratio with + # # zero waste. In practice, this may result in > 0 waste because profiling is done on only a portion + # # of the dataset. 
+ # # Or use `python llmfoundry/scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...` # # to profile this run's optimal packing_ratio as it depends on GPU count, # # batch size, sequence length # packing_ratio: diff --git a/scripts/train/yamls/finetune/mpt-7b_dolly_sft.yaml b/scripts/train/yamls/finetune/mpt-7b_dolly_sft.yaml index 9686317bef..2f23d8e55a 100644 --- a/scripts/train/yamls/finetune/mpt-7b_dolly_sft.yaml +++ b/scripts/train/yamls/finetune/mpt-7b_dolly_sft.yaml @@ -31,7 +31,10 @@ train_loader: max_seq_len: ${max_seq_len} allow_pad_trimming: false decoder_only_format: true - # # Use `python llmfoundry/data/packing.py --yaml-path /path/to/this/yaml/ ...` + # # Use packing_ratio: 'auto' to automatically profile and select the highest observed packing ratio with + # # zero waste. In practice, this may result in > 0 waste because profiling is done on only a portion + # # of the dataset. + # # Or use `python llmfoundry/scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...` # # to profile this run's optimal packing_ratio as it depends on GPU count, # # batch size, sequence length # packing_ratio: diff --git a/tests/test_dataloader.py b/tests/test_dataloader.py index 656b6d52a6..2080ec32ec 100644 --- a/tests/test_dataloader.py +++ b/tests/test_dataloader.py @@ -8,7 +8,7 @@ import sys import tempfile from argparse import Namespace -from typing import Optional +from typing import Literal, Optional, Union from unittest.mock import MagicMock import pytest @@ -248,10 +248,11 @@ def test_denoising_dataloader(decoder_only_format: bool, pretokenize: bool, @pytest.mark.parametrize('decoder_only_format', [True, False]) @pytest.mark.parametrize('allow_pad_trimming', [True, False]) -@pytest.mark.parametrize('packing_ratio', [10.0, None]) +@pytest.mark.parametrize('packing_ratio', [10.0, None, 'auto']) def test_finetuning_dataloader(decoder_only_format: bool, allow_pad_trimming: bool, - packing_ratio: Optional[float]): + packing_ratio: Optional[Union[float, + Literal['auto']]]): # Use the datasets just built in the last test tokenizer_name = 'gpt2' if decoder_only_format else 't5-base' max_seq_len = 2048 if decoder_only_format else 1024 diff --git a/tests/test_packing.py b/tests/test_packing.py new file mode 100644 index 0000000000..cbeca8b7b1 --- /dev/null +++ b/tests/test_packing.py @@ -0,0 +1,191 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +from typing import Any, Dict, List +from unittest.mock import Mock, patch + +import pytest +import torch +from composer.utils import dist, reproducibility +from omegaconf import DictConfig +from pytest import approx +from torch.utils.data import DataLoader + +from llmfoundry.data.finetuning.dataloader import build_finetuning_dataloader +from llmfoundry.data.packing import BinPackCollator, auto_packing_ratio +from llmfoundry.utils.builders import build_tokenizer + + +def _data_to_batch(data: List[List[int]], max_seq_len: int, + pad_token_id: int) -> Dict[str, torch.Tensor]: + """Helper function to create a proper batch of data.""" + input_ids = torch.stack([ + torch.tensor(d + [pad_token_id] * (max_seq_len - len(d))) for d in data + ]) + + attention_mask = torch.stack([ + torch.tensor([1] * len(d) + [pad_token_id] * (max_seq_len - len(d))) + for d in data + ]) + return {'input_ids': input_ids, 'attention_mask': attention_mask} + + +def test_packing(): + """Tests that packing works for a single batch.""" + pad_token_id = 0 + max_seq_len = 5 + packer = BinPackCollator(collator=lambda x: 
x, + target_batch_size=2, + max_seq_len=max_seq_len, + pad_token_id=pad_token_id, + padding_side='right') + + batch = _data_to_batch([ + [1], + [2] * 2, + [4] * 4, + [3] * 3, + ], max_seq_len, pad_token_id) + + packed_samples = packer.pack(batch) + + assert torch.equal(packed_samples['input_ids'], + torch.Tensor([[3, 3, 3, 2, 2], [4, 4, 4, 4, 1]])) + assert torch.all(packed_samples['attention_mask'] == 1) + + +def test_packing_with_leftovers(): + """Tests that packing handles leftovers and computes waste correctly.""" + pad_token_id = 0 + max_seq_len = 5 + packer = BinPackCollator(collator=lambda x: x, + target_batch_size=2, + max_seq_len=max_seq_len, + pad_token_id=pad_token_id, + padding_side='right') + + batch = _data_to_batch([ + [1], + [2] * 2, + [4] * 4, + [4] * 4, + ], max_seq_len, pad_token_id) + + packed_batch = packer.pack(batch) + + assert torch.equal(packed_batch['input_ids'], + torch.Tensor([[4, 4, 4, 4, 1], [4, 4, 4, 4, 0]])) + assert torch.equal(packed_batch['attention_mask'], + torch.Tensor([[1, 1, 1, 1, 1], [1, 1, 1, 1, 0]])) + + # Check leftovers and waste. + assert len(packer._leftover_bins) == 1 + leftover_size, leftover = packer._leftover_bins[0] + assert leftover_size == 2 + assert torch.equal(leftover['input_ids'], torch.Tensor([2, 2])) + assert torch.equal(leftover['attention_mask'], torch.Tensor([1, 1])) + assert packer.waste == approx(2 / 11) # 2 tokens wasted of 11 tokens total + + # Ensure that leftovers are used in the next batch if possible. + batch = _data_to_batch([[1]], max_seq_len, pad_token_id) + packed_batch = packer.pack(batch) + assert torch.equal(packed_batch['input_ids'], + torch.Tensor([[2, 2, 0, 0, 0], [1, 0, 0, 0, 0]])) + assert torch.equal(packed_batch['attention_mask'], + torch.Tensor([[1, 1, 0, 0, 0], [1, 0, 0, 0, 0]])) + + +@patch('llmfoundry.data.packing.profile_packing') +def test_auto_packing(profile_packing: Mock): + """Tests that auto packing selects the highest packing ratio with zero. + + waste. + """ + # List of tuples of packing_ratio, padding, waste, sorted by packing ratio + profile_packing.return_value = [(1, .9, 0), (2, .8, 0), (3, .7, .5)] + + packing_ratio = auto_packing_ratio( + dataloader_cfg=DictConfig({'dataset': { + 'max_seq_len': 2048 + }}), + tokenizer=None, + device_batch_size=1, + ) # Dummy values, profiling results are already set. + + # auto packing ratio should choose 2 because packing ratio is maximized while waste is 0. + assert packing_ratio == 2 + + +@pytest.mark.world_size(2) +@pytest.mark.gpu +@patch('llmfoundry.data.packing.profile_packing') +def test_dist_auto_packing(profile_packing: Mock): + """Tests that auto packing works with world size > 1.""" + dist.initialize_dist('gpu') + + # List of tuples of packing_ratio, padding, waste, sorted by packing ratio + if dist.get_global_rank() == 0: + profile_packing.return_value = [(1, .9, 0), (2, .8, 0), + (3, .7, 0)] # should pick 3 + else: + profile_packing.return_value = [(1, .9, 0), (2, .8, 0), + (3, .7, .5)] # should pick 2 + + packing_ratio = auto_packing_ratio( + dataloader_cfg=DictConfig({'dataset': { + 'max_seq_len': 2048 + }}), + tokenizer=None, + device_batch_size=1, + ) # Dummy values, profiling results are already set. + + # auto packing ratio should choose 2 because it's the minimum between ranks. 
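+    # (ranks must agree on a single ratio so every device packs the same number of raw
+    #  examples per batch; auto_packing_ratio all-reduces the selected ratio with MIN,
+    #  so the smaller candidate, 2, wins here)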
+ assert packing_ratio == 2 + + +@pytest.mark.parametrize('packing_ratio', ['auto', 2.0]) +def test_packing_with_dataloader(packing_ratio: Any): + """Tests that packing works with a dataloader.""" + reproducibility.seed_all(17) + tokenizer = build_tokenizer('gpt2', {}) + cfg = DictConfig({ + 'name': 'finetuning', + 'dataset': { + 'hf_name': 'tatsu-lab/alpaca', + 'split': 'train', + 'max_seq_len': 2048, + 'decoder_only_format': True, + 'allow_pad_trimming': False, + 'packing_ratio': packing_ratio, + 'shuffle': False, + }, + 'drop_last': False, + # Need to test with 0 num_workers because the packing collator object + # Gets copied per worker and we cannot check the waste for child processes. + 'num_workers': 0, + 'pin_memory': False, + 'prefetch_factor': None, + 'persistent_workers': False, + 'timeout': 0, + }) + + loader = build_finetuning_dataloader(cfg, tokenizer, + device_batch_size=6).dataloader + + assert isinstance(loader, DataLoader) + pack_collator = loader.collate_fn + assert isinstance(pack_collator, BinPackCollator) + + batch_ix = 0 + for _ in loader: + batch_ix += 1 + if batch_ix >= 3: + break + + padding = (1 - pack_collator.efficiency) + if packing_ratio == 'auto': + assert pack_collator.waste == approx(0) + assert padding == approx(0.1197916, rel=.01) + else: + assert pack_collator.waste == approx(0) + assert padding == approx(0.873720, rel=.01) From be467aee1744566e46e9b993c1ff23ab01fe5c55 Mon Sep 17 00:00:00 2001 From: Theresa Barton Date: Mon, 6 Nov 2023 10:42:26 -0800 Subject: [PATCH 18/49] Remove HumanEval tasks from ICL eval (#715) * add params * fix fsdp config * update * change model config * comment out human eval * remove boolq * actually remove real boolq * unroll boolq * really actully comment out humaneval * remove file * lint fix * lint fix --- scripts/eval/yamls/eval_gauntlet.yaml | 57 ++++++------ scripts/eval/yamls/tasks.yaml | 128 +++++++++++++------------- 2 files changed, 90 insertions(+), 95 deletions(-) diff --git a/scripts/eval/yamls/eval_gauntlet.yaml b/scripts/eval/yamls/eval_gauntlet.yaml index 1d2fa34139..791023abcf 100644 --- a/scripts/eval/yamls/eval_gauntlet.yaml +++ b/scripts/eval/yamls/eval_gauntlet.yaml @@ -133,32 +133,32 @@ eval_gauntlet: - name: boolq num_fewshot: 10 random_baseline: 0.5 - - name: programming - benchmarks: - - name: human_eval - num_fewshot: 0 - random_baseline: 0.0 - - name: human_eval_cpp - num_fewshot: 0 - random_baseline: 0.0 - - name: human_eval_js - num_fewshot: 0 - random_baseline: 0.0 - - name: human_eval_return_simple - num_fewshot: 0 - random_baseline: 0.0 - - name: human_eval_return_complex - num_fewshot: 0 - random_baseline: 0.0 - - name: human_eval_25 - num_fewshot: 0 - random_baseline: 0.0 - - name: human_eval_50 - num_fewshot: 0 - random_baseline: 0.0 - - name: human_eval_75 - num_fewshot: 0 - random_baseline: 0.0 + # - name: programming + # benchmarks: + # - name: human_eval + # num_fewshot: 0 + # random_baseline: 0.0 + # - name: human_eval_cpp + # num_fewshot: 0 + # random_baseline: 0.0 + # - name: human_eval_js + # num_fewshot: 0 + # random_baseline: 0.0 + # - name: human_eval_return_simple + # num_fewshot: 0 + # random_baseline: 0.0 + # - name: human_eval_return_complex + # num_fewshot: 0 + # random_baseline: 0.0 + # - name: human_eval_25 + # num_fewshot: 0 + # random_baseline: 0.0 + # - name: human_eval_50 + # num_fewshot: 0 + # random_baseline: 0.0 + # - name: human_eval_75 + # num_fewshot: 0 + # random_baseline: 0.0 - name: world_knowledge_lm_task_subscore benchmarks: - name: jeopardy @@ -258,8 +258,3 
@@ eval_gauntlet: - name: squad num_fewshot: 10 random_baseline: 0 - - name: programming_lite - benchmarks: - - name: human_eval - num_fewshot: 0 - random_baseline: 0.0 diff --git a/scripts/eval/yamls/tasks.yaml b/scripts/eval/yamls/tasks.yaml index 6b66c116ea..737b08ebeb 100644 --- a/scripts/eval/yamls/tasks.yaml +++ b/scripts/eval/yamls/tasks.yaml @@ -173,67 +173,67 @@ icl_tasks: num_fewshot: [10] icl_task_type: multiple_choice continuation_delimiter: "\nAnswer: " # this separates questions from answers -- - label: human_eval - dataset_uri: eval/local_data/programming/human_eval.jsonl # ADD YOUR OWN DATASET URI - num_fewshot: [0] - pass_at_k: 1 - num_beams: 20 - batch_size: 1 - icl_task_type: code_evaluation -- - label: human_eval_cpp - dataset_uri: eval/local_data/programming/processed_human_eval_cpp.jsonl # ADD YOUR OWN DATASET URI - num_fewshot: [0] - pass_at_k: 1 - num_beams: 20 - batch_size: 1 - icl_task_type: code_evaluation -- - label: human_eval_js - dataset_uri: eval/local_data/programming/processed_human_eval_js.jsonl # ADD YOUR OWN DATASET URI - num_fewshot: [0] - pass_at_k: 1 - num_beams: 20 - batch_size: 1 - icl_task_type: code_evaluation -- - label: human_eval_return_simple - dataset_uri: eval/local_data/programming/human_eval_return_simple.jsonl # ADD YOUR OWN DATASET URI - num_fewshot: [0] - pass_at_k: 1 - num_beams: 20 - batch_size: 1 - icl_task_type: code_evaluation -- - label: human_eval_return_complex - dataset_uri: eval/local_data/programming/human_eval_return_complex.jsonl # ADD YOUR OWN DATASET URI - num_fewshot: [0] - pass_at_k: 1 - num_beams: 20 - batch_size: 1 - icl_task_type: code_evaluation -- - label: human_eval_25 - dataset_uri: eval/local_data/programming/human_eval-0.25.jsonl # ADD YOUR OWN DATASET URI - num_fewshot: [0] - pass_at_k: 1 - num_beams: 20 - batch_size: 1 - icl_task_type: code_evaluation -- - label: human_eval_50 - dataset_uri: eval/local_data/programming/human_eval-0.5.jsonl # ADD YOUR OWN DATASET URI - num_fewshot: [0] - pass_at_k: 1 - num_beams: 20 - batch_size: 1 - icl_task_type: code_evaluation -- - label: human_eval_75 - dataset_uri: eval/local_data/programming/human_eval-0.75.jsonl # ADD YOUR OWN DATASET URI - num_fewshot: [0] - pass_at_k: 1 - num_beams: 20 - batch_size: 1 - icl_task_type: code_evaluation +# - +# label: human_eval +# dataset_uri: eval/local_data/programming/human_eval.jsonl # ADD YOUR OWN DATASET URI +# num_fewshot: [0] +# pass_at_k: 1 +# num_beams: 20 +# batch_size: 1 +# icl_task_type: code_evaluation +# - +# label: human_eval_cpp +# dataset_uri: eval/local_data/programming/processed_human_eval_cpp.jsonl # ADD YOUR OWN DATASET URI +# num_fewshot: [0] +# pass_at_k: 1 +# num_beams: 20 +# batch_size: 1 +# icl_task_type: code_evaluation +# - +# label: human_eval_js +# dataset_uri: eval/local_data/programming/processed_human_eval_js.jsonl # ADD YOUR OWN DATASET URI +# num_fewshot: [0] +# pass_at_k: 1 +# num_beams: 20 +# batch_size: 1 +# icl_task_type: code_evaluation +# - +# label: human_eval_return_simple +# dataset_uri: eval/local_data/programming/human_eval_return_simple.jsonl # ADD YOUR OWN DATASET URI +# num_fewshot: [0] +# pass_at_k: 1 +# num_beams: 20 +# batch_size: 1 +# icl_task_type: code_evaluation +# - +# label: human_eval_return_complex +# dataset_uri: eval/local_data/programming/human_eval_return_complex.jsonl # ADD YOUR OWN DATASET URI +# num_fewshot: [0] +# pass_at_k: 1 +# num_beams: 20 +# batch_size: 1 +# icl_task_type: code_evaluation +# - +# label: human_eval_25 +# dataset_uri: 
eval/local_data/programming/human_eval-0.25.jsonl # ADD YOUR OWN DATASET URI +# num_fewshot: [0] +# pass_at_k: 1 +# num_beams: 20 +# batch_size: 1 +# icl_task_type: code_evaluation +# - +# label: human_eval_50 +# dataset_uri: eval/local_data/programming/human_eval-0.5.jsonl # ADD YOUR OWN DATASET URI +# num_fewshot: [0] +# pass_at_k: 1 +# num_beams: 20 +# batch_size: 1 +# icl_task_type: code_evaluation +# - +# label: human_eval_75 +# dataset_uri: eval/local_data/programming/human_eval-0.75.jsonl # ADD YOUR OWN DATASET URI +# num_fewshot: [0] +# pass_at_k: 1 +# num_beams: 20 +# batch_size: 1 +# icl_task_type: code_evaluation From ffb58f18db01da470720366c649f7a267b9c27a5 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Mon, 6 Nov 2023 13:16:12 -0800 Subject: [PATCH 19/49] Allow logging metadata (#714) * metadata * precommit * add to config for other exp trackers * fix * pop off of config --- scripts/train/train.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/scripts/train/train.py b/scripts/train/train.py index 60ee55955e..88f776375f 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -383,6 +383,12 @@ def main(cfg: DictConfig) -> Trainer: 'compile_config', must_exist=False, default_value=None) + metadata: Optional[Dict[str, str]] = pop_config(cfg, + 'metadata', + must_exist=False, + default_value=None, + convert=True) + # Enable autoresume from model checkpoints if possible autoresume_default: bool = False if logged_cfg.get('run_name', None) is not None \ @@ -460,6 +466,14 @@ def main(cfg: DictConfig) -> Trainer: mosaicml_logger = MosaicMLLogger() loggers.append(mosaicml_logger) + if metadata is not None: + # Flatten the metadata for logging + logged_cfg.pop('metadata', None) + logged_cfg.update(metadata, merge=True) + if mosaicml_logger is not None: + mosaicml_logger.log_metrics(metadata) + mosaicml_logger._flush_metadata(force_flush=True) + # Profiling profiler: Optional[Profiler] = None profiler_cfg: Optional[DictConfig] = pop_config(cfg, From c2f5742d5d15e26b510bead331b35a82258b6c44 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Mon, 6 Nov 2023 14:01:42 -0800 Subject: [PATCH 20/49] Run HF dataset processing on local rank 0 first (#716) --- llmfoundry/data/finetuning/tasks.py | 40 ++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index edbfcc28c7..3673a48217 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -38,6 +38,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]: from typing import Any, Callable, Dict, List, Optional, Union import datasets as hf_datasets +from composer.utils import dist from omegaconf import DictConfig from streaming import StreamingDataset from transformers import PreTrainedTokenizerBase @@ -332,6 +333,16 @@ def build_from_hf( preprocessing_fn = self.get_preprocessing_fn_from_str( proto_preprocessing_fn, dataset_name) + signal_file_path = f'.node_{dist.get_node_rank()}_local_rank0_data_prep_completed' + + # Non local rank 0 ranks will wait here for local rank 0 to finish the data processing. + # Once local rank 0 is done, the datasets are all cached on disk, and all other ranks + # can just read them. 
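# A minimal sketch (illustration only, not part of the patch) of the
# "local rank 0 first" pattern the hunk below implements: local rank 0 does the
# data prep and warms the cache, the other local ranks wait on a signal file,
# and all ranks sync before the file is cleaned up. `prepare_data` is a
# hypothetical stand-in; the dist helpers are the same composer.utils.dist
# calls used in this patch.
import os
from composer.utils import dist

def node_local_prepare(signal_file_path: str, prepare_data):
    if dist.get_local_rank() != 0:
        # Non-zero local ranks block here until local rank 0 writes the signal file.
        with dist.local_rank_zero_download_and_wait(signal_file_path):
            pass
    dataset = prepare_data()  # rank 0 populates the cache; other ranks read from it
    if dist.get_local_rank() == 0:
        with open(signal_file_path, 'wb') as f:
            f.write(b'local_rank0_completed')
    dist.barrier()  # everyone syncs before the signal file is removed
    if dist.get_local_rank() == 0:
        os.remove(signal_file_path)
    return dataset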
+ if dist.get_local_rank() != 0: + log.debug('Waiting for local_rank 0 to finish data prep') + with dist.local_rank_zero_download_and_wait(signal_file_path): + pass + dataset = hf_datasets.load_dataset(dataset_name, split=split, **kwargs) def dataset_mapper(example: Dict): @@ -340,7 +351,8 @@ def dataset_mapper(example: Dict): return _tokenize_formatted_example(example, tokenizer) detected_cpu_count = os.cpu_count() or 1 - num_cpus_to_use = max(1, detected_cpu_count - 4) + detected_cpus_with_margin = detected_cpu_count - 8 + num_cpus_to_use = max(1, detected_cpus_with_margin) columns_to_remove = list(dataset[0].keys()) tokenized_dataset = dataset.map( @@ -348,10 +360,12 @@ def dataset_mapper(example: Dict): batched=False, remove_columns=columns_to_remove, num_proc=num_cpus_to_use, + desc='Tokenizing dataset', ) prompt_length_filtered_dataset = tokenized_dataset.filter( lambda example: len(example['input_ids']) < max_seq_len, num_proc=num_cpus_to_use, + desc='Filtering out long prompts', ) examples_removed = len(tokenized_dataset) - len( @@ -361,10 +375,16 @@ def dataset_mapper(example: Dict): f'Dropped {examples_removed} examples where the prompt was longer than {max_seq_len}.' ) + pad_token_id = tokenizer.pad_token_id empty_examples_dropped_dataset = prompt_length_filtered_dataset.filter( lambda example: len(example['input_ids']) > 0 and len(example[ - 'labels']) > 0 and any(token_id != tokenizer.pad_token_id - for token_id in example['labels'])) + 'labels']) > 0 and any(token_id != pad_token_id + for token_id in example['labels']), + num_proc=num_cpus_to_use, + desc='Filtering out empty examples') + + log.debug('Done tokenizing and filtering examples.') + empty_examples_removed = len(prompt_length_filtered_dataset) - len( empty_examples_dropped_dataset) if empty_examples_removed > 0: @@ -372,6 +392,20 @@ def dataset_mapper(example: Dict): f'Dropped {empty_examples_removed} examples where the prompt or response was empty, ' + 'or the response was only padding tokens.') + # Now local rank 0 indicates to the other ranks that it is done + if dist.get_local_rank() == 0: + log.debug('Local rank 0 finished data prep') + with open(signal_file_path, 'wb') as f: + f.write(b'local_rank0_completed_data_prep') + + # All ranks sync up at this barrier, having completed data processing + dist.barrier() + + # Last, local rank 0 cleans up the signal file + if dist.get_local_rank() == 0: + os.remove(signal_file_path) + + log.debug('All ranks finished data prep') return empty_examples_dropped_dataset def build_from_streaming(self, *args: Any, From 58d7cf3e3bcbbd21a77c71ff3a37e7be50d46bbe Mon Sep 17 00:00:00 2001 From: Jerry Chen Date: Mon, 6 Nov 2023 14:31:27 -0800 Subject: [PATCH 21/49] Add Hugging Face model download script (#708) * Add Hugging Face model download script * Decode response bytes to string * Clean * Move download functions to foundry utils * Clean up script * Add bs4 dependency * Fix typing * Doc formatting * Doc formatting * Fix weights preference logic * Unit tests for weights preference logic in download_from_hf_hub * Unit tests for download_from_cache_server * Add retries and unit tests * pyright * code quality checks * precommit --------- Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- llmfoundry/utils/__init__.py | 4 + llmfoundry/utils/model_download_utils.py | 228 +++++++++++++++++++++ scripts/misc/download_hf_model.py | 67 ++++++ setup.py | 5 +- tests/test_model_download_utils.py | 248 +++++++++++++++++++++++ 5 files changed, 551 insertions(+), 1 
deletion(-) create mode 100644 llmfoundry/utils/model_download_utils.py create mode 100644 scripts/misc/download_hf_model.py create mode 100644 tests/test_model_download_utils.py diff --git a/llmfoundry/utils/__init__.py b/llmfoundry/utils/__init__.py index 38cc562c9d..7abe4dcf75 100644 --- a/llmfoundry/utils/__init__.py +++ b/llmfoundry/utils/__init__.py @@ -11,6 +11,8 @@ from llmfoundry.utils.config_utils import (calculate_batch_size_info, log_config, pop_config, update_batch_size_info) + from llmfoundry.utils.model_download_utils import ( + download_from_cache_server, download_from_hf_hub) except ImportError as e: raise ImportError( 'Please make sure to pip install . to get requirements for llm-foundry.' @@ -26,6 +28,8 @@ 'build_tokenizer', 'calculate_batch_size_info', 'convert_and_save_ft_weights', + 'download_from_cache_server', + 'download_from_hf_hub', 'get_hf_tokenizer_from_composer_state_dict', 'update_batch_size_info', 'log_config', diff --git a/llmfoundry/utils/model_download_utils.py b/llmfoundry/utils/model_download_utils.py new file mode 100644 index 0000000000..d268cb78b7 --- /dev/null +++ b/llmfoundry/utils/model_download_utils.py @@ -0,0 +1,228 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +"""Utility functions for downloading models.""" +import copy +import logging +import os +import time +from http import HTTPStatus +from typing import Optional +from urllib.parse import urljoin + +import huggingface_hub as hf_hub +import requests +import tenacity +from bs4 import BeautifulSoup +from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME +from transformers.utils import WEIGHTS_INDEX_NAME as PYTORCH_WEIGHTS_INDEX_NAME +from transformers.utils import WEIGHTS_NAME as PYTORCH_WEIGHTS_NAME + +DEFAULT_IGNORE_PATTERNS = [ + '*.ckpt', + '*.h5', + '*.msgpack', +] +PYTORCH_WEIGHTS_PATTERN = 'pytorch_model*.bin*' +SAFE_WEIGHTS_PATTERN = 'model*.safetensors*' + +log = logging.getLogger(__name__) + + +@tenacity.retry(retry=tenacity.retry_if_not_exception_type( + (ValueError, hf_hub.utils.RepositoryNotFoundError)), + stop=tenacity.stop_after_attempt(3), + wait=tenacity.wait_exponential(min=1, max=10)) +def download_from_hf_hub( + repo_id: str, + save_dir: Optional[str] = None, + prefer_safetensors: bool = True, + token: Optional[str] = None, +): + """Downloads model files from a Hugging Face Hub model repo. + + Only supports models stored in Safetensors and PyTorch formats for now. If both formats are available, only the + Safetensors weights will be downloaded unless `prefer_safetensors` is set to False. + + Args: + repo_id (str): The Hugging Face Hub repo ID. + save_dir (str, optional): The path to the directory where the model files will be downloaded. If `None`, reads + from the `HUGGINGFACE_HUB_CACHE` environment variable or uses the default Hugging Face Hub cache directory. + prefer_safetensors (bool): Whether to prefer Safetensors weights over PyTorch weights if both are + available. Defaults to True. + token (str, optional): The HuggingFace API token. If not provided, the token will be read from the + `HUGGING_FACE_HUB_TOKEN` environment variable. + + Raises: + RepositoryNotFoundError: If the model repo doesn't exist or the token is unauthorized. + ValueError: If the model repo doesn't contain any supported model weights. + """ + repo_files = set(hf_hub.list_repo_files(repo_id)) + + # Ignore TensorFlow, TensorFlow 2, and Flax weights as they are not supported by Composer. 
+ ignore_patterns = copy.deepcopy(DEFAULT_IGNORE_PATTERNS) + + safetensors_available = (SAFE_WEIGHTS_NAME in repo_files or + SAFE_WEIGHTS_INDEX_NAME in repo_files) + pytorch_available = (PYTORCH_WEIGHTS_NAME in repo_files or + PYTORCH_WEIGHTS_INDEX_NAME in repo_files) + + if safetensors_available and pytorch_available: + if prefer_safetensors: + log.info( + 'Safetensors available and preferred. Excluding pytorch weights.' + ) + ignore_patterns.append(PYTORCH_WEIGHTS_PATTERN) + else: + log.info( + 'Pytorch available and preferred. Excluding safetensors weights.' + ) + ignore_patterns.append(SAFE_WEIGHTS_PATTERN) + elif safetensors_available: + log.info('Only safetensors available. Ignoring weights preference.') + elif pytorch_available: + log.info('Only pytorch available. Ignoring weights preference.') + else: + raise ValueError( + f'No supported model weights found in repo {repo_id}.' + + ' Please make sure the repo contains either safetensors or pytorch weights.' + ) + + download_start = time.time() + hf_hub.snapshot_download(repo_id, + cache_dir=save_dir, + ignore_patterns=ignore_patterns, + token=token) + download_duration = time.time() - download_start + log.info( + f'Downloaded model {repo_id} from Hugging Face Hub in {download_duration} seconds' + ) + + +def _extract_links_from_html(html: str): + """Extracts links from HTML content. + + Args: + html (str): The HTML content + + Returns: + list[str]: A list of links to download. + """ + soup = BeautifulSoup(html, 'html.parser') + links = [a['href'] for a in soup.find_all('a')] + return links + + +def _recursive_download( + session: requests.Session, + base_url: str, + path: str, + save_dir: str, + ignore_cert: bool = False, +): + """Downloads all files/subdirectories from a directory on a remote server. + + Args: + session: A requests.Session through which to make requests to the remote server. + url (str): The base URL where the files are located. + path (str): The path from the base URL to the files to download. The full URL for the download is equal to + '/'. + save_dir (str): The directory to save downloaded files to. + ignore_cert (bool): Whether or not to ignore the validity of the SSL certificate of the remote server. + Defaults to False. + WARNING: Setting this to true is *not* secure, as no certificate verification will be performed. + + Raises: + PermissionError: If the remote server returns a 401 Unauthorized status code. + ValueError: If the remote server returns a 404 Not Found status code. + RuntimeError: If the remote server returns a status code other than 200 OK or 401 Unauthorized. + """ + url = urljoin(base_url, path) + response = session.get(url, verify=(not ignore_cert)) + + if response.status_code == HTTPStatus.UNAUTHORIZED: + raise PermissionError( + f'Not authorized to download file from {url}. Received status code {response.status_code}. ' + ) + elif response.status_code == HTTPStatus.NOT_FOUND: + raise ValueError( + f'Could not find file at {url}. Received status code {response.status_code}' + ) + elif response.status_code != HTTPStatus.OK: + raise RuntimeError( + f'Could not download file from {url}. Received unexpected status code {response.status_code}' + ) + + # Assume that the URL points to a file if it does not end with a slash. 
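# For reference, an illustrative call to the hub-download helper defined above
# (not part of the patch; the repo id, directory, and token are placeholders).
from llmfoundry.utils.model_download_utils import download_from_hf_hub

download_from_hf_hub(
    'mosaicml/mpt-7b',         # any Hugging Face Hub repo id
    save_dir='/tmp/hf_cache',  # defaults to the HF hub cache when omitted
    prefer_safetensors=True,   # pytorch weights are used only if safetensors are absent
    token=None,                # or an HF token for gated/private repos
)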
+ if not path.endswith('/'): + save_path = os.path.join(save_dir, path) + parent_dir = os.path.dirname(save_path) + if not os.path.exists(parent_dir): + os.makedirs(parent_dir) + + with open(save_path, 'wb') as f: + f.write(response.content) + + log.info(f'Downloaded file {save_path}') + return + + # If the URL is a directory, the response should be an HTML directory listing that we can parse for additional links + # to download. + child_links = _extract_links_from_html(response.content.decode()) + for child_link in child_links: + _recursive_download(session, + base_url, + urljoin(path, child_link), + save_dir, + ignore_cert=ignore_cert) + + +@tenacity.retry(retry=tenacity.retry_if_not_exception_type( + (PermissionError, ValueError)), + stop=tenacity.stop_after_attempt(3), + wait=tenacity.wait_exponential(min=1, max=10)) +def download_from_cache_server( + model_name: str, + cache_base_url: str, + save_dir: str, + token: Optional[str] = None, + ignore_cert: bool = False, +): + """Downloads Hugging Face models from a mirror file server. + + The file server is expected to store the files in the same structure as the Hugging Face cache + structure. See https://huggingface.co/docs/huggingface_hub/guides/manage-cache. + + Args: + model_name: The name of the model to download. This should be the same as the repository ID in the Hugging Face + Hub. + cache_base_url: The base URL of the cache file server. This function will attempt to download all of the blob + files from `//blobs/`, where `formatted_model_name` is equal to + `models/` with all slashes replaced with `--`. + save_dir: The directory to save the downloaded files to. + token: The Hugging Face API token. If not provided, the token will be read from the `HUGGING_FACE_HUB_TOKEN` + environment variable. + ignore_cert: Whether or not to ignore the validity of the SSL certificate of the remote server. Defaults to + False. + WARNING: Setting this to true is *not* secure, as no certificate verification will be performed. 
+ """ + formatted_model_name = f'models/{model_name}'.replace('/', '--') + with requests.Session() as session: + session.headers.update({'Authorization': f'Bearer {token}'}) + + download_start = time.time() + + # Only downloads the blobs in order to avoid downloading model files twice due to the + # symlnks in the Hugging Face cache structure: + _recursive_download( + session, + cache_base_url, + # Trailing slash to indicate directory + f'{formatted_model_name}/blobs/', + save_dir, + ignore_cert=ignore_cert, + ) + download_duration = time.time() - download_start + log.info( + f'Downloaded model {model_name} from cache server in {download_duration} seconds' + ) diff --git a/scripts/misc/download_hf_model.py b/scripts/misc/download_hf_model.py new file mode 100644 index 0000000000..6465a552c2 --- /dev/null +++ b/scripts/misc/download_hf_model.py @@ -0,0 +1,67 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +"""Script to download model weights from Hugging Face Hub or a cache server.""" +import argparse +import logging +import os +import sys + +from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE + +from llmfoundry.utils.model_download_utils import (download_from_cache_server, + download_from_hf_hub) + +HF_TOKEN_ENV_VAR = 'HUGGING_FACE_HUB_TOKEN' + +log = logging.getLogger(__name__) + +if __name__ == '__main__': + argparser = argparse.ArgumentParser() + argparser.add_argument('--model', type=str, required=True) + argparser.add_argument('--download-from', + type=str, + choices=['hf', 'cache'], + default='hf') + argparser.add_argument('--token', + type=str, + default=os.getenv(HF_TOKEN_ENV_VAR)) + argparser.add_argument('--save-dir', + type=str, + default=HUGGINGFACE_HUB_CACHE) + argparser.add_argument('--cache-url', type=str, default=None) + argparser.add_argument('--ignore-cert', action='store_true', default=False) + argparser.add_argument( + '--fallback', + action='store_true', + default=False, + help= + 'Whether to fallback to downloading from Hugging Face if download from cache fails', + ) + + args = argparser.parse_args(sys.argv[1:]) + if args.download_from == 'hf': + download_from_hf_hub(args.model, + save_dir=args.save_dir, + token=args.token) + else: + try: + download_from_cache_server( + args.model, + args.cache_url, + args.save_dir, + token=args.token, + ignore_cert=args.ignore_cert, + ) + except PermissionError: + log.error(f'Not authorized to download {args.model}.') + except Exception as e: + if args.fallback: + log.warn( + f'Failed to download {args.model} from cache server. Falling back to Hugging Face Hub. Error: {e}' + ) + download_from_hf_hub(args.model, + save_dir=args.save_dir, + token=args.token) + else: + raise e diff --git a/setup.py b/setup.py index 63aac9d752..f528838d35 100644 --- a/setup.py +++ b/setup.py @@ -66,6 +66,8 @@ 'triton-pre-mlir@git+https://github.com/vchiley/triton.git@triton_pre_mlir_sm90#subdirectory=python', 'boto3>=1.21.45,<2', 'huggingface-hub>=0.17.0,<1.0', + 'beautifulsoup4>=4.12.2,<5', # required for model download utils + 'tenacity>=8.2.3,<9', ] extra_deps = {} @@ -101,7 +103,8 @@ extra_deps['peft'] = [ 'loralib==0.1.1', # lora core 'bitsandbytes==0.39.1', # 8bit - 'scipy>=1.10.0,<=1.11.0', # bitsandbytes dependency; TODO: eliminate when incorporated to bitsandbytes + # bitsandbytes dependency; TODO: eliminate when incorporated to bitsandbytes + 'scipy>=1.10.0,<=1.11.0', # TODO: pin peft when it stabilizes. 
# PyPI does not support direct dependencies, so we remove this line before uploading from PyPI 'peft==0.4.0', diff --git a/tests/test_model_download_utils.py b/tests/test_model_download_utils.py new file mode 100644 index 0000000000..27b9805cda --- /dev/null +++ b/tests/test_model_download_utils.py @@ -0,0 +1,248 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +import os +import unittest.mock as mock +from http import HTTPStatus +from typing import Any, Dict, List +from unittest.mock import MagicMock +from urllib.parse import urljoin + +import pytest +import requests +import tenacity +from huggingface_hub.utils import RepositoryNotFoundError +from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME +from transformers.utils import WEIGHTS_INDEX_NAME as PYTORCH_WEIGHTS_INDEX_NAME +from transformers.utils import WEIGHTS_NAME as PYTORCH_WEIGHTS_NAME + +from llmfoundry.utils.model_download_utils import (DEFAULT_IGNORE_PATTERNS, + PYTORCH_WEIGHTS_PATTERN, + SAFE_WEIGHTS_PATTERN, + download_from_cache_server, + download_from_hf_hub) + +# ======================== download_from_hf_hub tests ======================== + + +@pytest.mark.parametrize( + ['prefer_safetensors', 'repo_files', 'expected_ignore_patterns'], + [ + [ # Should use default ignore if only safetensors available + True, + [SAFE_WEIGHTS_NAME], + DEFAULT_IGNORE_PATTERNS, + ], + [ + # Should use default ignore if only safetensors available + False, + [SAFE_WEIGHTS_NAME], + DEFAULT_IGNORE_PATTERNS, + ], + [ # Should use default ignore if only sharded safetensors available + True, + [SAFE_WEIGHTS_INDEX_NAME], + DEFAULT_IGNORE_PATTERNS, + ], + [ + # Should use default ignore if only sharded safetensors available + False, + [SAFE_WEIGHTS_INDEX_NAME], + DEFAULT_IGNORE_PATTERNS, + ], + [ + # Should use default ignore if only pytorch available + True, + [PYTORCH_WEIGHTS_NAME], + DEFAULT_IGNORE_PATTERNS, + ], + [ + # Should use default ignore if only pytorch available + False, + [PYTORCH_WEIGHTS_NAME], + DEFAULT_IGNORE_PATTERNS, + ], + [ + # Should use default ignore if only sharded pytorch available + True, + [PYTORCH_WEIGHTS_INDEX_NAME], + DEFAULT_IGNORE_PATTERNS, + ], + [ + # Should use default ignore if only sharded pytorch available + False, + [PYTORCH_WEIGHTS_INDEX_NAME], + DEFAULT_IGNORE_PATTERNS, + ], + [ # Ignore pytorch if safetensors are preferred + True, + [PYTORCH_WEIGHTS_NAME, SAFE_WEIGHTS_NAME], + DEFAULT_IGNORE_PATTERNS + [PYTORCH_WEIGHTS_PATTERN], + ], + [ # Ignore safetensors if pytorch is preferred + False, + [PYTORCH_WEIGHTS_NAME, SAFE_WEIGHTS_NAME], + DEFAULT_IGNORE_PATTERNS + [SAFE_WEIGHTS_PATTERN], + ], + [ # Ignore pytorch if safetensors are preferred + True, + [PYTORCH_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_INDEX_NAME], + DEFAULT_IGNORE_PATTERNS + [PYTORCH_WEIGHTS_PATTERN], + ], + [ # Ignore safetensors if pytorch is preferred + False, + [PYTORCH_WEIGHTS_NAME, SAFE_WEIGHTS_NAME], + DEFAULT_IGNORE_PATTERNS + [SAFE_WEIGHTS_PATTERN], + ], + ]) +@mock.patch('huggingface_hub.snapshot_download') +@mock.patch('huggingface_hub.list_repo_files') +def test_download_from_hf_hub_weights_pref(mock_list_repo_files: MagicMock, + mock_snapshot_download: MagicMock, + prefer_safetensors: bool, + repo_files: List[str], + expected_ignore_patterns: List[str]): + test_repo_id = 'test_repo_id' + mock_list_repo_files.return_value = repo_files + + download_from_hf_hub(test_repo_id, prefer_safetensors=prefer_safetensors) + mock_snapshot_download.assert_called_once_with( + test_repo_id, + 
cache_dir=None, + ignore_patterns=expected_ignore_patterns, + token=None, + ) + + +@mock.patch('huggingface_hub.snapshot_download') +@mock.patch('huggingface_hub.list_repo_files') +def test_download_from_hf_hub_no_weights( + mock_list_repo_files: MagicMock, + mock_snapshot_download: MagicMock, +): + test_repo_id = 'test_repo_id' + mock_list_repo_files.return_value = [] + + with pytest.raises(ValueError): + download_from_hf_hub(test_repo_id) + + mock_snapshot_download.assert_not_called() + + +@pytest.mark.parametrize(['exception', 'expected_attempts'], [ + [requests.exceptions.RequestException(), 3], + [RepositoryNotFoundError(''), 1], + [ValueError(), 1], +]) +@mock.patch('tenacity.nap.time.sleep') +@mock.patch('huggingface_hub.snapshot_download') +@mock.patch('huggingface_hub.list_repo_files') +def test_download_from_hf_hub_retry( + mock_list_repo_files: MagicMock, + mock_snapshot_download: MagicMock, + mock_sleep: MagicMock, # so the retry wait doesn't actually wait + exception: BaseException, + expected_attempts: int, +): + mock_list_repo_files.return_value = [SAFE_WEIGHTS_INDEX_NAME] + mock_snapshot_download.side_effect = exception + + with pytest.raises((tenacity.RetryError, exception.__class__)): + download_from_hf_hub('test_repo_id') + + assert mock_snapshot_download.call_count == expected_attempts + + +# ======================== download_from_cache_server tests ======================== + +ROOT_HTML = b""" + + + + + + +""" + +SUBFOLDER_HTML = b""" + + + + + + +""" + + +@mock.patch.object(requests.Session, 'get') +@mock.patch('os.makedirs') +@mock.patch('builtins.open') +def test_download_from_cache_server(mock_open: MagicMock, + mock_makedirs: MagicMock, + mock_get: MagicMock): + cache_url = 'https://cache.com/' + model_name = 'model' + formatted_model_name = 'models--model' + save_dir = 'save_dir/' + + mock_open.return_value = MagicMock() + + def _server_response(url: str, **kwargs: Dict[str, Any]): + if url == urljoin(cache_url, f'{formatted_model_name}/blobs/'): + return MagicMock(status_code=HTTPStatus.OK, content=ROOT_HTML) + if url == urljoin(cache_url, f'{formatted_model_name}/blobs/file1'): + return MagicMock(status_code=HTTPStatus.OK) + elif url == urljoin(cache_url, f'{formatted_model_name}/blobs/folder/'): + return MagicMock(status_code=HTTPStatus.OK, content=SUBFOLDER_HTML) + elif url == urljoin(cache_url, + f'{formatted_model_name}/blobs/folder/file2'): + return MagicMock(status_code=HTTPStatus.OK) + else: + return MagicMock(status_code=HTTPStatus.NOT_FOUND) + + mock_get.side_effect = _server_response + download_from_cache_server(model_name, cache_url, 'save_dir/') + + mock_open.assert_has_calls([ + mock.call(os.path.join(save_dir, formatted_model_name, 'blobs/file1'), + 'wb'), + mock.call( + os.path.join(save_dir, formatted_model_name, 'blobs/folder/file2'), + 'wb'), + ], + any_order=True) + + +@mock.patch.object(requests.Session, 'get') +def test_download_from_cache_server_unauthorized(mock_get: MagicMock): + cache_url = 'https://cache.com/' + model_name = 'model' + save_dir = 'save_dir/' + + mock_get.return_value = MagicMock(status_code=HTTPStatus.UNAUTHORIZED) + with pytest.raises(PermissionError): + download_from_cache_server(model_name, cache_url, save_dir) + + +@pytest.mark.parametrize(['exception', 'expected_attempts'], [ + [requests.exceptions.RequestException(), 3], + [PermissionError(), 1], + [ValueError(), 1], +]) +@mock.patch('tenacity.nap.time.sleep') +@mock.patch('llmfoundry.utils.model_download_utils._recursive_download') +def 
test_download_from_cache_server_retry( + mock_recursive_download: MagicMock, + mock_sleep: MagicMock, # so the retry wait doesn't actually wait + exception: BaseException, + expected_attempts: int, +): + mock_recursive_download.side_effect = exception + + with pytest.raises((tenacity.RetryError, exception.__class__)): + download_from_cache_server('model', 'cache_url', 'save_dir') From 1d504c851c26d54e8a07b2a2245fd6cbd4d283e0 Mon Sep 17 00:00:00 2001 From: Shashank Rajput <144760128+ShashankMosaicML@users.noreply.github.com> Date: Mon, 6 Nov 2023 15:00:19 -0800 Subject: [PATCH 22/49] Adding support for Rotary Position Embeddings (#675) * .. * .. * .. * .. * .. * .. * .. * .. * .. * .. * .. * .. * .. * .. * .. * .. * .. * .. * .. * .. * .. * .. * removed the roformer impementation of rope * .. * fixed all the lint errors * .. * .. * ../llmfoundry/models/mpt/modeling_mpt.py * .. * .. * .. * added unit test to test rotary embeddings * .. * .. * .. * .. * .. * .. * .. * .. * .. * Update llmfoundry/models/mpt/modeling_mpt.py Accepting the suggestion Co-authored-by: Vitaliy Chiley <6439018+vchiley@users.noreply.github.com> * incorporated some suggestions from the pr * .. * .. * .. * .. * .. * .. * .. * added mark for gpu in the rotary embedding test * .. * .. * .. * removed thecode for hf implementation of rope * .. * .. * added tests * .. * .. * ... * .. * .. * .. * .. * .. * fixed the tests after the merge * minor change * Fixed some tests failing due to a transformers library bug * added check for flash_attention before importing their rotary embedding * added check for flash_attention in tests before using dail rope * fixed tests * .. * .. * temporary fix * .. * .. * fixed a test * .. * minor change * minor changes * added documentation * added documentation * temp commit * made _set_config_defaults recursive * minor changes * reformatted tutorial table * reformatted tutorial table * reformatted tutorial table * added documentation on how to install flash attention 2 * minor changes * minor changes * minor changes * minor changes * minor changes * minor changes * .. 
* resolved some comments from the PR * fixed tests * modified is_flash_v2_installed * minor changes * Update TUTORIAL.md Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> * Update TUTORIAL.md Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> * Update TUTORIAL.md Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> * Update TUTORIAL.md Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> * resolved PR comments --------- Co-authored-by: Shashank Rajput Co-authored-by: Vitaliy Chiley <6439018+vchiley@users.noreply.github.com> Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- TUTORIAL.md | 49 +- llmfoundry/models/layers/attention.py | 71 ++- llmfoundry/models/layers/blocks.py | 43 +- llmfoundry/models/mpt/configuration_mpt.py | 72 ++- llmfoundry/models/mpt/modeling_mpt.py | 129 ++++- tests/test_flash_triton_torch.py | 73 ++- tests/test_model.py | 557 +++++++++++++++++---- tests/test_rope_dail_vs_hf.py | 145 ++++++ 8 files changed, 952 insertions(+), 187 deletions(-) create mode 100644 tests/test_rope_dail_vs_hf.py diff --git a/TUTORIAL.md b/TUTORIAL.md index d019eb9f83..86bd9829e9 100644 --- a/TUTORIAL.md +++ b/TUTORIAL.md @@ -8,27 +8,42 @@ Forging LLMs can be quite complicated — you have to get your data prepared, se This tutorial will provide a brief intro to the repo’s structure and underlying tools (all courtesy of MosaicML, of course), will go over a few example workflows and point you to the related resources within the repo, and will finally cover a number of FAQs that we have encountered since release. +- [LLM Foundry Tutorial](#llm-foundry-tutorial) - [Intro](#intro) - [How this repo is structured](#how-this-repo-is-structured) - [Key components](#key-components) + - [Composer](#composer) + - [StreamingDataset](#streamingdataset) + - [MCLI](#mcli) - [How the YAMLs work](#how-the-yamls-work) - [Example Workflows](#example-workflows) - [Workflow 1: I want to play with a HF model like MPT-7B locally](#workflow-1-i-want-to-play-with-a-hf-model-like-mpt-7b-locally) - [Workflow 2: I want to deploy an inference endpoint with a HF model like MPT-7B](#workflow-2-i-want-to-deploy-an-inference-endpoint-with-a-hf-model-like-mpt-7b) - [Workflow 3: I want to finetune a HF model like MPT-7B](#workflow-3-i-want-to-finetune-a-hf-model-like-mpt-7b) + - [Supervised FineTuning and Instruction FineTuning](#supervised-finetuning-and-instruction-finetuning) + - [Domain Adaptation and Sequence Length Adaptation](#domain-adaptation-and-sequence-length-adaptation) + - [Data](#data) + - [Modeling](#modeling) - [Workflow 4: I want to train a new HF model from scratch](#workflow-4-i-want-to-train-a-new-hf-model-from-scratch) - [FAQs](#faqs) - - [Why is the script only using 1 out of N GPUs?](#why-is-the-script-only-using-1-out-of-n-gpus) - - [I’m running into an Out-Of-Memory (OOM) error. 
What do I do?](#im-running-into-an-out-of-memory-oom-error-what-do-i-do) - - [What hardware can I train on?](#what-hardware-can-i-train-on) - - [What hardware can I run eval on?](#what-hardware-can-i-run-eval-on) - - [What is FSDP?](#what-is-fsdp) - - [What are the different attention options `torch` / `flash` / `triton` for MPT and which one should I use?](#what-are-the-different-attention-options-torch--flash--triton-for-mpt-and-which-one-should-i-use) - - [Can I finetune using PEFT / LORA?](#can-i-finetune-using-peft--lora) - - [Can I quantize these models and/or run on CPU?](#can-i-quantize-these-models-andor-run-on-cpu) - - [How do I deploy with ONNX/FasterTransformer?](#how-do-i-deploy-with-onnxfastertransformer) - - [How expensive is it to build LLMs?](#how-expensive-is-it-to-build-llms) - - [Common installation issues](#common-installation-issues) + - [Why is the script only using 1 out of N GPUs?](#why-is-the-script-only-using-1-out-of-n-gpus) + - [I’m running into an Out-Of-Memory (OOM) error. What do I do?](#im-running-into-an-out-of-memory-oom-error-what-do-i-do) + - [What hardware can I train on?](#what-hardware-can-i-train-on) + - [What hardware can I run eval on?](#what-hardware-can-i-run-eval-on) + - [What hardware can I run inference on?](#what-hardware-can-i-run-inference-on) + - [What is FSDP?](#what-is-fsdp) + - [What are the different attention options `torch` / `flash` / `triton` for MPT and which one should I use?](#what-are-the-different-attention-options-torch--flash--triton--for-mpt-and-which-one-should-i-use) + - [Limitations](#limitations) + - [What is `triton-pre-mlir`?](#what-is-triton-pre-mlir) + - [Known issue with sm86+ GPUs](#known-issue-with-sm86-gpus) + - [Support for FlashAttention-2](#support-for-flashattention-2) + - [What kinds of positional embeddings does LLM Foundry support?](#what-kinds-of-positional-embeddings-does-llm-foundry-support) + - [Can I finetune using PEFT / LoRA?](#can-i-finetune-using-peft--lora) + - [Can I quantize these models and/or run on CPU?](#can-i-quantize-these-models-andor-run-on-cpu) + - [How do I deploy with ONNX/FasterTransformer?](#how-do-i-deploy-with-onnxfastertransformer) + - [TransformerEngine and amp\_fp8 support](#transformerengine-and-amp_fp8-support) + - [How expensive is it to build LLMs?](#how-expensive-is-it-to-build-llms) + - [Common installation issues](#common-installation-issues) Let’s get started! @@ -328,6 +343,18 @@ The majority of our training setups use `triton`. --> Updating to LLVM14 (or LLVM15) cannot be done because there are breaking changes. What is the result of this? Although sm89+ is not **formally** supported until LLVM15, our testing on H100 GPUs shows that `attn_impl=triton` still works well and still runs fast. The only issue is that when the network is starting to run, LLVM might throw a warning like: `'sm_90' is not a recognized processor for this target (ignoring processor)`. This warning does not seem to affect performance. +#### Support for FlashAttention-2 +- [FlashAttention-2](https://arxiv.org/pdf/2307.08691.pdf) improves upon FlashAttention to get even faster attention computation. LLM Foundry supports FlashAttention-2. Please follow the instructions [here](https://github.com/mosaicml/llm-foundry/tree/main/scripts/train#flashattention). + +### What kinds of positional embeddings does LLM Foundry support? 
+Currently we support [Learned Positional Embeddings](https://arxiv.org/pdf/1706.03762.pdf), [Attention with Linear Biases (ALiBi)](https://arxiv.org/pdf/2108.12409.pdf), and [Rotary Positional Embeddings (RoPE)](https://arxiv.org/pdf/2104.09864.pdf). There is also an option to switch off all of these embeddings to get [No Positional Embedding](https://arxiv.org/pdf/2203.16634.pdf). + +| Name | YAML Config | Training MFU on MPT-7B trained on 8 A100 80GB GPUs | Notes | +|:-----------------------------------|:------------------------------------------------------------------|:---------------------------------------------------:|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Learned Positional Embeddings |
<pre>model:<br>  learned_pos_emb: True</pre> | 65.7 | |
+| ALiBi | <pre>model:<br>  attn_config:<br>    alibi: True</pre> | 64.5 | Requires Triton or Torch attention. |
+| RoPE (Dao-AILab Implementation) | <pre>model:<br>  attn_config:<br>    rope: True<br>    rope_impl: dail</pre> | 64.5 | Requires a CUDA GPU and the [flash-attn library](https://github.com/Dao-AILab/flash-attention) v2.0.1 or higher to be installed. Please see the instructions in the [paragraph above](#support-for-flashattention-2) on how to install flash-attn v2. Note that the attention implementation can still be `torch`, `triton`, or `flash`. |
+| RoPE (Hugging Face Implementation) | <pre>model:<br>  attn_config:<br>    rope: True<br>    rope_impl: hf</pre>
| 62.3 | | ### Can I finetune using PEFT / LoRA? - The LLM Foundry codebase does not directly have examples of PEFT or LORA workflows. However, our MPT model is a subclass of HuggingFace `PretrainedModel`, and https://github.com/mosaicml/llm-foundry/pull/346 added required features to enable HuggingFace’s [PEFT](https://huggingface.co/docs/peft/index) / [LORA](https://huggingface.co/docs/peft/conceptual_guides/lora) workflows for MPT. MPT models with LoRA modules can be trained either using LLM Foundry or Hugging Face's [accelerate](https://huggingface.co/docs/accelerate/index). Within LLM Foundry, run (`scripts/train/train.py`), adding `lora` arguments to the config `.yaml`, like so: diff --git a/llmfoundry/models/layers/attention.py b/llmfoundry/models/layers/attention.py index 39fa7162ac..0503d6d75a 100644 --- a/llmfoundry/models/layers/attention.py +++ b/llmfoundry/models/layers/attention.py @@ -5,7 +5,7 @@ import math import warnings -from typing import Any, List, Optional, Tuple +from typing import Any, Optional import torch import torch.nn as nn @@ -17,12 +17,13 @@ from llmfoundry.models.layers.norm import NORM_CLASS_REGISTRY -def is_flash_v2_installed(): +def is_flash_v2_installed(v2_version: str = '2.0.0'): + assert version.parse(v2_version) >= version.parse('2.0.0') try: import flash_attn as flash_attn except: return False - return version.parse(flash_attn.__version__) >= version.parse('2.0.0') + return version.parse(flash_attn.__version__) >= version.parse(v2_version) def is_flash_v1_installed(): @@ -33,6 +34,16 @@ def is_flash_v1_installed(): return version.parse(flash_attn.__version__) < version.parse('2.0.0') +# Before importing any transformers models, we need to disable transformers flash attention if +# we are in an environment with flash attention version <2. Transformers hard errors on a not properly +# gated import otherwise. 
+if is_flash_v1_installed(): + import transformers + transformers.utils.is_flash_attn_available = lambda: False + +from transformers.models.llama.modeling_llama import apply_rotary_pos_emb + + def _reset_is_causal(num_query_tokens: int, num_key_tokens: int, original_is_causal: bool) -> bool: # disable causal when it is not needed @@ -70,7 +81,7 @@ def scaled_multihead_dot_product_attention( value: torch.Tensor, n_heads: int, kv_n_heads: Optional[int] = None, - past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + past_key_value: Optional[tuple[torch.Tensor, torch.Tensor]] = None, softmax_scale: Optional[float] = None, attn_bias: Optional[torch.Tensor] = None, key_padding_mask: Optional[torch.Tensor] = None, @@ -79,7 +90,7 @@ def scaled_multihead_dot_product_attention( training: bool = False, needs_weights: bool = False, multiquery: bool = False, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor, +) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor, torch.Tensor]]]: if multiquery: @@ -183,7 +194,7 @@ def scaled_multihead_dot_product_attention( def check_valid_inputs(*tensors: torch.Tensor, - valid_dtypes: Optional[List[torch.dtype]] = None): + valid_dtypes: Optional[list[torch.dtype]] = None): if valid_dtypes is None: valid_dtypes = [torch.float16, torch.bfloat16] for tensor in tensors: @@ -199,7 +210,7 @@ def flash_attn_fn( value: torch.Tensor, n_heads: int, kv_n_heads: Optional[int] = None, - past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + past_key_value: Optional[tuple[torch.Tensor, torch.Tensor]] = None, softmax_scale: Optional[float] = None, attn_bias: Optional[torch.Tensor] = None, key_padding_mask: Optional[torch.Tensor] = None, @@ -208,7 +219,7 @@ def flash_attn_fn( training: bool = False, needs_weights: bool = False, multiquery: bool = False, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor, +) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor, torch.Tensor]]]: try: from flash_attn import bert_padding, flash_attn_interface # type: ignore # yapf: disable # isort: skip @@ -337,7 +348,7 @@ def triton_flash_attn_fn( value: torch.Tensor, n_heads: int, kv_n_heads: Optional[int] = None, - past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + past_key_value: Optional[tuple[torch.Tensor, torch.Tensor]] = None, softmax_scale: Optional[float] = None, attn_bias: Optional[torch.Tensor] = None, key_padding_mask: Optional[torch.Tensor] = None, @@ -346,7 +357,7 @@ def triton_flash_attn_fn( training: bool = False, needs_weights: bool = False, multiquery: bool = False, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor, +) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor, torch.Tensor]]]: try: from llmfoundry.models.layers.flash_attn_triton import flash_attn_func @@ -552,12 +563,13 @@ def __init__( def forward( self, x: torch.Tensor, - past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + past_key_value: Optional[tuple[torch.Tensor, torch.Tensor]] = None, attn_bias: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, + rotary_emb_w_meta_info: Optional[dict] = None, is_causal: bool = True, needs_weights: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[ + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[ torch.Tensor, torch.Tensor]]]: qkv = self.Wqkv(x) @@ -581,6 +593,39 @@ def forward( query = 
self.q_ln(query).to(dtype) key = self.k_ln(key).to(dtype) + if rotary_emb_w_meta_info is not None: + rotary_emb = rotary_emb_w_meta_info['rotary_emb'] + seq_len = rotary_emb_w_meta_info['seq_len'] + offset_info = rotary_emb_w_meta_info['offset_info'] + bsz, seqlen = query.shape[:2] + query = query.view(bsz, seqlen, -1, self.head_dim) + key = key.view(bsz, seqlen, -1, self.head_dim) + + if rotary_emb_w_meta_info['impl'] == 'dail': + value = value.view(bsz, seqlen, -1, self.head_dim) + + kv = torch.stack([key, value], dim=2) + query, kv = rotary_emb(query, + kv, + seqlen_offset=offset_info, + max_seqlen=seq_len) + [key, value] = torch.unbind(kv, dim=2) + + value = value.view(bsz, seqlen, self.kv_n_heads * self.head_dim) + elif rotary_emb_w_meta_info['impl'] == 'hf': + (cos, sin) = rotary_emb(value, seq_len) + # The following two transposes should be removed once the transformers library allows for the specification of the dimension for heads in the call to apply_rotary_pos_emb + query = query.transpose(1, 2) + key = key.transpose(1, 2) + query, key = apply_rotary_pos_emb(query, key, cos, sin, + offset_info) + # The following two transposes should be removed once the transformers library allows for the specification of the dimension for heads in the call to apply_rotary_pos_emb + query = query.transpose(1, 2) + key = key.transpose(1, 2) + + query = query.view(bsz, seqlen, self.d_model) + key = key.view(bsz, seqlen, self.kv_n_heads * self.head_dim) + context, attn_weights, past_key_value = self.attn_fn( query, key, @@ -677,7 +722,7 @@ def __init__( def attn_bias_shape( attn_impl: str, n_heads: int, seq_len: int, alibi: bool, prefix_lm: bool, causal: bool, - use_sequence_id: bool) -> Optional[Tuple[int, int, int, int]]: + use_sequence_id: bool) -> Optional[tuple[int, int, int, int]]: if attn_impl == 'flash': return None elif attn_impl in ['torch', 'triton']: diff --git a/llmfoundry/models/layers/blocks.py b/llmfoundry/models/layers/blocks.py index a08ef6d77f..6605807c6b 100644 --- a/llmfoundry/models/layers/blocks.py +++ b/llmfoundry/models/layers/blocks.py @@ -12,6 +12,31 @@ from llmfoundry.models.layers.ffn import FFN_CLASS_REGISTRY, build_ffn from llmfoundry.models.layers.norm import NORM_CLASS_REGISTRY +attn_config_defaults: Dict = { + 'attn_type': 'multihead_attention', + 'attn_pdrop': 0.0, + 'attn_impl': 'triton', + 'qk_ln': False, + 'clip_qkv': None, + 'softmax_scale': None, + 'prefix_lm': False, + 'attn_uses_sequence_id': False, + 'alibi': False, + 'alibi_bias_max': 8, + 'rope': False, + 'rope_theta': 10000, + 'rope_impl': 'dail', + 'rope_dail_config': { + 'type': 'original', + 'pos_idx_in_fp32': True, + 'xpos_scale_base': 512, + }, + 'rope_hf_config': { + 'type': 'no_scaling', + 'factor': 1.0, + }, +} + class MPTBlock(nn.Module): @@ -30,18 +55,7 @@ def __init__( **kwargs: Any, ): if attn_config is None: - attn_config = { - 'attn_type': 'multihead_attention', - 'attn_pdrop': 0.0, - 'attn_impl': 'triton', - 'qk_ln': False, - 'clip_qkv': None, - 'softmax_scale': None, - 'prefix_lm': False, - 'attn_uses_sequence_id': False, - 'alibi': False, - 'alibi_bias_max': 8, - } + attn_config = attn_config_defaults if ffn_config is None: ffn_config = { @@ -58,7 +72,8 @@ def __init__( # necessary to avoid passing extraneous args into attn_class while allowing the use of **kwargs args_to_exclude_in_attn_class = { 'attn_type', 'prefix_lm', 'alibi', 'attn_uses_sequence_id', - 'alibi_bias_max' + 'alibi_bias_max', 'rope', 'rope_theta', 'rope_impl', + 'rope_dail_config', 'rope_hf_config' } 
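# An illustrative override of the attn_config defaults above that enables
# rotary embeddings (example values, not part of the patch). A dict like this
# would be supplied as an MPT model's attn_config; unspecified keys fall back
# to attn_config_defaults, and rope_impl may be 'dail' or 'hf'.
rope_attn_config = {
    'attn_impl': 'triton',
    'rope': True,
    'rope_theta': 10000,
    'rope_impl': 'hf',
    'rope_hf_config': {
        'type': 'linear',  # one of 'no_scaling', 'linear', 'dynamic'
        'factor': 2.0,     # scaling factor used by 'linear' and 'dynamic'
    },
}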
attn_config_subset_for_attn_class = { k: v @@ -94,6 +109,7 @@ def forward( x: torch.Tensor, past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, attn_bias: Optional[torch.Tensor] = None, + rotary_emb_w_meta_info: Optional[Dict] = None, attention_mask: Optional[torch.ByteTensor] = None, is_causal: bool = True, output_attentions: bool = False, @@ -104,6 +120,7 @@ def forward( a, past_key_value=past_key_value, attn_bias=attn_bias, + rotary_emb_w_meta_info=rotary_emb_w_meta_info, attention_mask=attention_mask, is_causal=is_causal, needs_weights=output_attentions, diff --git a/llmfoundry/models/mpt/configuration_mpt.py b/llmfoundry/models/mpt/configuration_mpt.py index 251e4f5caf..c4ca68d733 100644 --- a/llmfoundry/models/mpt/configuration_mpt.py +++ b/llmfoundry/models/mpt/configuration_mpt.py @@ -8,18 +8,16 @@ from transformers import PretrainedConfig -attn_config_defaults: Dict = { - 'attn_type': 'multihead_attention', - 'attn_pdrop': 0.0, - 'attn_impl': 'triton', - 'qk_ln': False, - 'clip_qkv': None, - 'softmax_scale': None, - 'prefix_lm': False, - 'attn_uses_sequence_id': False, - 'alibi': False, - 'alibi_bias_max': 8, -} +from llmfoundry.models.layers.attention import is_flash_v2_installed +from llmfoundry.models.layers.blocks import attn_config_defaults + +# NOTE: All utils are imported directly even if unused so that +# HuggingFace can detect all the needed files to copy into its modules folder. +# Otherwise, certain modules are missing. +# isort: off +from llmfoundry.models.layers.fc import FC_CLASS_REGISTRY # type: ignore (see note) +from llmfoundry.models.layers.norm import LPLayerNorm # type: ignore (see note) +from llmfoundry.models.layers.ffn import FFN_CLASS_REGISTRY # type: ignore (see note) ffn_config_defaults: Dict = { 'ffn_type': 'mptmlp', @@ -94,6 +92,16 @@ def __init__( Defaults to ``False`` meaning any provided `sequence_id` will be ignored. alibi (bool): Whether to use the alibi bias instead of position embeddings. alibi_bias_max (int): The maximum value of the alibi bias. + rope (bool): Whether to use rotary positional embeddings. + rope_theta (int): The base frequency for rope. + rope_impl (str): The implementation of rope to use. One of 'hf' (to use the implementation from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py) or 'dail' (to use the implementation from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/layers/rotary.py). + rope_dail_config (Dict): The configuration for the dail implementation of rope. + type (str): The type of rotary position embedding to use. Options: 'original' (for https://arxiv.org/pdf/2104.09864.pdf), 'xpos' (for https://arxiv.org/pdf/2212.10554.pdf). + pos_idx_in_fp32 (bool): If True, the position indices [0, ..., seqlen - 1] are in fp32, otherwise they might be in lower precision. A consequence could be, for example, that bf16 rounds position 1995 to 2000, which leads to them having the same positional embedding. + xpos_scale_base (float): The scale base for XPos (if using XPos). + rope_hf_config (Dict): A dictionary used to configure rope's scaling behavior (when scaling beyond the training length). + type (str): Can be one of 'no_scaling', 'linear', or 'dynamic'. 'no_scaling' uses the default implementation for rotary embeddings, 'linear' uses linear scaling as proposed by the Reddit user /u/kaiokendev, and 'dynamic' uses Dynamic NTK scaling as proposed by the Reddit users /u/bloc97 and /u/emozilla. 
+ factor (float): Scaling factor to use if using 'linear' or 'dynamic' as rope_scaling.type. kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads. ffn_config (Dict): A dictionary used to configure the model's ffn module: ffn_type (str): type of ffn to use. Options: mptmlp, te_ln_mlp @@ -150,10 +158,12 @@ def __init__( del kwargs['name'] if 'loss_fn' in kwargs: del kwargs['loss_fn'] - if self.attn_config.get('alibi', False): + if self.attn_config.get('alibi', False) or self.attn_config.get( + 'rope', False): self.learned_pos_emb = False warnings.warn( - f'alibi is turned on, setting `learned_pos_emb` to `False.`') + f'alibi or rope is turned on, setting `learned_pos_emb` to `False.`' + ) super().__init__(**kwargs) self._validate_config() @@ -164,6 +174,10 @@ def _set_config_defaults(self, config: Dict[str, Any], for k, v in config_defaults.items(): if k not in config: config[k] = v + elif isinstance(v, dict): + # recursively set default values for any sub-dicts + config[k] = self._set_config_defaults( + config[k] if (config[k] is not None) else {}, v) return config def _validate_config(self) -> None: @@ -206,6 +220,31 @@ def _validate_config(self) -> None: raise NotImplementedError( 'attn_uses_sequence_id only implemented with torch and triton attention.' ) + if self.attn_config['rope'] and (self.attn_config['rope_impl'] + not in ['dail', 'hf']): + raise ValueError( + 'If rope is being used then rope_impl should be either "dail", or "hf".' + ) + if self.attn_config['rope'] and ( + self.attn_config['rope_impl'] + == 'hf') and self.attn_config['rope_hf_config']['type'] not in [ + 'no_scaling', 'linear', 'dynamic' + ]: + raise ValueError( + 'If using hf implementation of rope, the type should be one of "no_scaling", "linear" or "dynamic".' + ) + if self.attn_config['rope'] and (self.attn_config['rope_impl'] + == 'dail'): + if self.attn_config['rope_dail_config']['type'] not in [ + 'original', 'xpos' + ]: + raise ValueError( + 'If using the dail implementation of rope, the type should be one of "original" or "xpos".' + ) + if not is_flash_v2_installed(v2_version='2.0.1'): + raise ImportError( + 'If using the dail implementation of rope, the flash_attn library v2.0.1 or higher must be installed. Please check the instructions at https://github.com/mosaicml/llm-foundry/blob/main/TUTORIAL.md#what-kinds-of-positional-embeddings-does-llm-foundry-support' + ) if self.embedding_fraction > 1 or self.embedding_fraction <= 0: raise ValueError( 'model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!' @@ -217,9 +256,10 @@ def _validate_config(self) -> None: ) if self.init_config.get('name', None) is None: raise ValueError(f"{self.init_config=} 'name' needs to be set.") - if not self.learned_pos_emb and not self.attn_config['alibi']: + if not (self.learned_pos_emb or self.attn_config['alibi'] or + self.attn_config['rope']): warnings.warn( - f'Positional information not being provided to the model using either learned_pos_emb or alibi.' + f'Positional information not being provided to the model using either learned_pos_emb or alibi or rope.' 
) if self.fc_type == 'te' or self.ffn_config['ffn_type'] == 'te_ln_mlp': try: diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py index 4f4581b177..0cb3ebd56c 100644 --- a/llmfoundry/models/mpt/modeling_mpt.py +++ b/llmfoundry/models/mpt/modeling_mpt.py @@ -23,11 +23,27 @@ from composer.metrics.nlp import LanguageCrossEntropy, LanguagePerplexity from composer.models import HuggingFaceModel from composer.utils import dist + +from llmfoundry.models.layers.attention import is_flash_v2_installed + +if is_flash_v2_installed(): + try: # This try...except is needed because transformers requires it despite the 'if' statement above + from flash_attn.layers.rotary import \ + RotaryEmbedding as DAILRotaryEmbedding + except Exception as e: + raise e + from omegaconf import DictConfig from omegaconf import OmegaConf as om from transformers import PreTrainedModel, PreTrainedTokenizerBase from transformers.modeling_outputs import (BaseModelOutputWithPast, CausalLMOutputWithPast) +from transformers.models.llama.modeling_llama import \ + LlamaDynamicNTKScalingRotaryEmbedding as HFDynamicNTKScalingRotaryEmbedding +from transformers.models.llama.modeling_llama import \ + LlamaLinearScalingRotaryEmbedding as HFLinearScalingRotaryEmbedding +from transformers.models.llama.modeling_llama import \ + LlamaRotaryEmbedding as HFRotaryEmbedding from llmfoundry.models.layers.attention import attn_bias_shape, build_attn_bias from llmfoundry.models.layers.blocks import MPTBlock @@ -70,6 +86,50 @@ log = logging.getLogger(__name__) +def gen_rotary_embedding(rope_head_dim: int, rope_impl: str, rope_theta: int, + rope_dail_config: dict, rope_hf_config: dict, + max_seq_len: int): + if rope_impl == 'dail': + return DAILRotaryEmbedding( + dim=rope_head_dim, + base=rope_theta, + interleaved=False, + scale_base=rope_dail_config['xpos_scale_base'] if + (rope_dail_config['type'] == 'xpos') else None, + pos_idx_in_fp32=rope_dail_config['pos_idx_in_fp32'], + device= + 'cpu', # FSDP does not materialize modules with meta buffers, hence device is set to cpu + ) + elif rope_impl == 'hf': + if rope_hf_config['type'] == 'no_scaling': + return HFRotaryEmbedding( + rope_head_dim, + max_position_embeddings=max_seq_len, + base=rope_theta, + device= + 'cpu' # FSDP does not materialize modules with meta buffers, hence device is set to cpu + ) + elif rope_hf_config['type'] == 'linear': + return HFLinearScalingRotaryEmbedding( + rope_head_dim, + max_position_embeddings=max_seq_len, + base=rope_theta, + scaling_factor=rope_hf_config['factor'], + device= + 'cpu' # FSDP does not materialize modules with meta buffers, hence device is set to cpu + ) + elif rope_hf_config['type'] == 'dynamic': + return HFDynamicNTKScalingRotaryEmbedding( + rope_head_dim, + max_position_embeddings=max_seq_len, + base=rope_theta, + scaling_factor=rope_hf_config['factor'], + device= + 'cpu' # FSDP does not materialize modules with meta buffers, hence device is set to cpu + ) + raise ValueError('rope_impl needs to be either dail or hf') + + class MPTPreTrainedModel(PreTrainedModel): config_class = MPTConfig base_model_prefix = 'model' @@ -123,6 +183,18 @@ def __init__(self, config: MPTConfig): ]) self.norm_f = norm_class(config.d_model, device=config.init_device) + self.rope = config.attn_config['rope'] + self.rope_impl = None + if self.rope: + self.rope_impl = config.attn_config['rope_impl'] + self.rotary_embedding = gen_rotary_embedding( + rope_head_dim=config.d_model // config.n_heads, + rope_impl=self.rope_impl, + 
rope_theta=config.attn_config['rope_theta'], + rope_dail_config=config.attn_config['rope_dail_config'], + rope_hf_config=config.attn_config['rope_hf_config'], + max_seq_len=self.config.max_seq_len) + if config.init_device != 'meta': log.info( f'We recommend using config.init_device="meta" with Composer + FSDP for faster initialization.' @@ -361,8 +433,9 @@ def forward( S <= self.config.max_seq_len ), f'Cannot forward input with seq_len={S}, this model only supports seq_len<={self.config.max_seq_len}' - tok_emb = self.wte(input_ids) - if self.learned_pos_emb: + rotary_emb_w_meta_info = None + x = self.wte(input_ids) + if self.learned_pos_emb or self.rope: past_position = 0 if past_key_values is not None: if len(past_key_values) != self.config.n_layers: @@ -378,31 +451,44 @@ def forward( if self.attn_impl == 'torch': past_position = past_key_values[0][0].size(3) - if S + past_position > self.config.max_seq_len: + if self.learned_pos_emb and (S + past_position > + self.config.max_seq_len): raise ValueError( f'Cannot forward input with past sequence length {past_position} and current sequence length ' + f'{S + 1}, this model only supports total sequence length <= {self.config.max_seq_len}.' ) - pos = torch.arange( - past_position, - S + past_position, - dtype=torch.long, - device=input_ids.device, - ).unsqueeze(0) - if attention_mask is not None: - # adjust the position indices to account for padding tokens - pos = torch.clamp( - pos - torch.cumsum((~attention_mask).to(torch.int32), - dim=1)[:, past_position:], - min=0, - ) - pos_emb = self.wpe(pos) - x = tok_emb + pos_emb - else: - # ALiBi and NoPE use this path (RoPE will also use this path if / when enabled) - x = tok_emb + if self.learned_pos_emb or (self.rope and self.rope_impl == 'hf'): + pos = torch.arange( + past_position, + S + past_position, + dtype=torch.long, + device=input_ids.device, + ).unsqueeze(0) + if attention_mask is not None: + # adjust the position indices to account for padding tokens + pos = torch.clamp( + pos - torch.cumsum((~attention_mask).to(torch.int32), + dim=1)[:, past_position:], + min=0, + ) + if self.learned_pos_emb: + x = x + self.wpe(pos) + elif self.rope and self.rope_impl == 'hf': + rotary_emb_w_meta_info = { + 'impl': self.rope_impl, + 'rotary_emb': self.rotary_embedding, + 'offset_info': pos, + 'seq_len': S + past_position, + } + elif self.rope and self.rope_impl == 'dail': + rotary_emb_w_meta_info = { + 'impl': self.rope_impl, + 'rotary_emb': self.rotary_embedding, + 'offset_info': past_position, + 'seq_len': S + past_position, + } if self.embedding_fraction == 1: x = self.emb_drop(x) @@ -439,6 +525,7 @@ def forward( x, past_key_value=past_key_value, attn_bias=attn_bias, + rotary_emb_w_meta_info=rotary_emb_w_meta_info, attention_mask=attention_mask, is_causal=self.is_causal, output_attentions=bool(output_attentions), diff --git a/tests/test_flash_triton_torch.py b/tests/test_flash_triton_torch.py index e6fe8eb438..3f2c229d6d 100644 --- a/tests/test_flash_triton_torch.py +++ b/tests/test_flash_triton_torch.py @@ -5,6 +5,9 @@ import torch from omegaconf import OmegaConf as om +from llmfoundry.models.layers.attention import is_flash_v2_installed +from llmfoundry.models.mpt.modeling_mpt import gen_rotary_embedding + def allclose_helper(t0: torch.Tensor, t1: torch.Tensor, @@ -18,7 +21,32 @@ def allclose_helper(t0: torch.Tensor, @pytest.mark.parametrize('attn_impl_1', ['flash', 'triton', 'torch']) @pytest.mark.parametrize('clip_qkv', [True, False]) @pytest.mark.parametrize('qk_ln', [True, False]) 
-@pytest.mark.parametrize('alibi', [True, False]) +@pytest.mark.parametrize('pos_emb_config', [{ + 'alibi': False, + 'rope': False +}, { + 'alibi': True, + 'rope': False +}, { + 'alibi': False, + 'rope': True, + 'rope_theta': 10000, + 'rope_impl': 'dail', + 'rope_dail_config': { + 'type': 'original', + 'pos_idx_in_fp32': True, + 'xpos_scale_base': 512, + }, +}, { + 'alibi': False, + 'rope': True, + 'rope_theta': 10000, + 'rope_impl': 'hf', + 'rope_hf_config': { + 'type': 'no_scaling', + 'factor': 1.0, + }, +}]) @pytest.mark.parametrize( 'attn_type', ['multihead_attention', 'multiquery_attention', 'grouped_query_attention']) @@ -26,18 +54,24 @@ def test_attn_impl(attn_impl_0: str, attn_impl_1: str, clip_qkv: bool, qk_ln: bool, - alibi: bool, + pos_emb_config: dict, attn_type: str, device: str = 'cuda'): """Compare all attn impl with each other. - Includes testing with and without attn_clip_qkv, attn_qk_ln, and alibi. + Includes testing with and without attn_clip_qkv, attn_qk_ln, alibi, and + rope. """ from llmfoundry.models.layers import attention - + alibi = pos_emb_config['alibi'] + rope = pos_emb_config['rope'] if alibi and (attn_impl_0 == 'flash' or attn_impl_1 == 'flash'): pytest.xfail('flash attn does not support alibi') + if rope and (pos_emb_config['rope_impl'] + == 'dail') and (not is_flash_v2_installed()): + pytest.skip('dail implementation of rope requires flash attention 2.') + cfg = om.create({ 'attn_impl': 'flash', 'd_model': 128, @@ -48,7 +82,7 @@ def test_attn_impl(attn_impl_0: str, }) n, s, f = 2, 16, cfg.d_model - + assert cfg.d_model % cfg.n_heads == 0 if attn_type == 'grouped_query_attention': cfg.kv_n_heads = 2 @@ -91,16 +125,45 @@ def gen_bias(attn_impl: str): with torch.autocast(x0.device.type): attn_bias = gen_bias(attn0.attn_impl) + + rotary_emb_w_meta_info = None + if rope: + rotary_embedding = gen_rotary_embedding( + rope_head_dim=cfg.d_model // cfg.n_heads, + rope_impl=pos_emb_config['rope_impl'], + rope_theta=pos_emb_config['rope_theta'], + rope_dail_config=pos_emb_config.get('rope_dail_config', {}), + rope_hf_config=pos_emb_config.get('rope_hf_config', {}), + max_seq_len=s).to(device) + pos = torch.arange(s).unsqueeze(0).to(device=device) + # adjust the position indices to account for padding tokens + pos = torch.clamp( + pos - torch.cumsum((~attention_mask).to(torch.int32), dim=1), + min=0, + ) + rotary_emb_w_meta_info = { + 'impl': + pos_emb_config['rope_impl'], + 'rotary_emb': + rotary_embedding, + 'offset_info': + pos if (pos_emb_config['rope_impl'] == 'hf') else 0, + 'seq_len': + s, + } + y0, _, _ = attn0(x0, past_key_value=None, attn_bias=attn_bias, attention_mask=attention_mask, + rotary_emb_w_meta_info=rotary_emb_w_meta_info, is_causal=True) attn_bias = gen_bias(attn1.attn_impl) y1, _, _ = attn1(x1, past_key_value=None, attn_bias=attn_bias, attention_mask=attention_mask, + rotary_emb_w_meta_info=rotary_emb_w_meta_info, is_causal=True) y0 *= attention_mask.unsqueeze(-1) y1 *= attention_mask.unsqueeze(-1) diff --git a/tests/test_model.py b/tests/test_model.py index 1c7033ed48..41b62f0ccf 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -16,7 +16,7 @@ from composer.core.precision import Precision, get_precision_context from composer.optim import DecoupledAdamW from composer.trainer.dist_strategy import prepare_fsdp_module -from composer.utils import dist, get_device +from composer.utils import dist, get_device, reproducibility from omegaconf import DictConfig, ListConfig from omegaconf import OmegaConf as om from transformers import 
(AutoModelForCausalLM, AutoTokenizer, PreTrainedModel, @@ -28,6 +28,7 @@ from llmfoundry import COMPOSER_MODEL_REGISTRY, ComposerHFCausalLM from llmfoundry.models.hf.model_wrapper import HuggingFaceModelWithZLoss from llmfoundry.models.layers import NORM_CLASS_REGISTRY, build_alibi_bias +from llmfoundry.models.layers.attention import is_flash_v2_installed from llmfoundry.models.layers.blocks import MPTBlock from llmfoundry.models.mpt import MPTConfig, MPTForCausalLM from llmfoundry.utils import build_tokenizer @@ -517,16 +518,49 @@ def test_mpt_creation(norm_type: str, no_bias: bool): ('flash', 'gpu'), ('triton', 'gpu'), ('torch', 'gpu')]) -@pytest.mark.parametrize('alibi', [True, False]) -def test_forward_with_padding(attention_impl: str, device: str, alibi: bool): +@pytest.mark.parametrize('pos_emb_config', [{ + 'alibi': False, + 'rope': False +}, { + 'alibi': True, + 'rope': False +}, { + 'alibi': False, + 'rope': True, + 'rope_theta': 10000, + 'rope_impl': 'dail', + 'rope_dail_config': { + 'type': 'original', + 'pos_idx_in_fp32': True, + 'xpos_scale_base': 512, + }, +}, { + 'alibi': False, + 'rope': True, + 'rope_theta': 10000, + 'rope_impl': 'hf', + 'rope_hf_config': { + 'type': 'no_scaling', + 'factor': 1.0, + }, +}]) +def test_forward_with_padding(attention_impl: str, device: str, + pos_emb_config: dict): # Test that different placement of padding does not affect the output. if not torch.cuda.is_available() and device == 'gpu': pytest.skip( f'This test requires CUDA to be available in order to run with {attention_impl} attention.' ) + alibi = pos_emb_config['alibi'] if alibi and attention_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') + rope = pos_emb_config['rope'] + if rope and pos_emb_config['rope_impl'] == 'dail' and ( + device != 'gpu' or not is_flash_v2_installed()): + pytest.skip( + f'dail implementation of rope requires gpu and flash attention 2.') + composer_device = get_device(device) hf_config = MPTConfig( @@ -540,7 +574,7 @@ def test_forward_with_padding(attention_impl: str, device: str, alibi: bool): resid_pdrop=0.2, attn_config={ 'attn_impl': attention_impl, - 'alibi': alibi, + **pos_emb_config, }, init_config={ 'name': 'baseline_', @@ -612,23 +646,35 @@ def test_forward_with_padding(attention_impl: str, device: str, alibi: bool): attention_mask=batched_attention_mask).logits # check that right padding and left padding produce the same output + right_pad_v_left_pad_rtol = 1e-5 + right_pad_v_left_pad_atol = 1e-6 if attention_impl == 'torch' else 1e-8 + if rope and pos_emb_config['rope_impl'] == 'dail': + # dail implementation of rope uses bf16 precision and hence the rotations have small numerical errors. This causes some differences between the outputs of padded and unpadded inputs. + right_pad_v_left_pad_rtol = 1e-2 + right_pad_v_left_pad_atol = 1e-2 assert torch.allclose(right_padding_output[0, :3], left_padding_output[0, 3:], - atol=1e-6 if attention_impl == 'torch' else 1e-8) - if not alibi: + rtol=right_pad_v_left_pad_rtol, + atol=right_pad_v_left_pad_atol) + + if not (alibi or (rope and pos_emb_config['rope_impl'] == 'dail')): # check that right padding and middle padding produce the same output # Note: alibi not implemented for middle padding. + # Note: dail implementation of rope does not support middle padding. 
assert torch.allclose( right_padding_output[0, :3], middle_padding_output[0, [0, 1, 5]], atol=1e-6 if attention_impl == 'torch' else 1e-8) + # check that right padding and right padding in a batch produce the same output assert torch.allclose(right_padding_output[0, :3], batched_output[0, :3], atol=1e-6 if attention_impl == 'torch' else 1e-8) - if not alibi: + + if not (alibi or (rope and pos_emb_config['rope_impl'] == 'dail')): # check that middle padding and middle padding in a batch produce the same output # Note: alibi not implemented for middle padding. + # Note: dail implementation of rope does not support middle padding. assert torch.allclose( middle_padding_output[0], batched_output[1, :], @@ -694,17 +740,47 @@ def test_advanced_mask_building(attention_impl: str): ('flash', 'gpu'), ('triton', 'gpu'), ('torch', 'gpu')]) -@pytest.mark.parametrize('alibi', [True, False]) -def test_generate(attention_impl: str, device: str, alibi: bool): +@pytest.mark.parametrize('pos_emb_config', [{ + 'alibi': False, + 'rope': False +}, { + 'alibi': True, + 'rope': False +}, { + 'alibi': False, + 'rope': True, + 'rope_theta': 10000, + 'rope_impl': 'dail', + 'rope_dail_config': { + 'type': 'original', + 'pos_idx_in_fp32': True, + 'xpos_scale_base': 512, + }, +}, { + 'alibi': False, + 'rope': True, + 'rope_theta': 10000, + 'rope_impl': 'hf', + 'rope_hf_config': { + 'type': 'no_scaling', + 'factor': 1.0, + }, +}]) +def test_generate(attention_impl: str, device: str, pos_emb_config: dict): # Test that generate works, and produces the same output with or without # padding in the input. if not torch.cuda.is_available() and device == 'gpu': pytest.skip( f'This test requires CUDA to be available in order to run with {attention_impl} attention.' ) - if alibi and attention_impl == 'flash': + if pos_emb_config['alibi'] and attention_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') + if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and ( + device != 'gpu' or not is_flash_v2_installed()): + pytest.skip( + f'dail implementation of rope requires gpu and flash attention 2.') + composer_device = get_device(device) hf_config = MPTConfig( @@ -718,7 +794,7 @@ def test_generate(attention_impl: str, device: str, alibi: bool): resid_pdrop=0.2, attn_config={ 'attn_impl': attention_impl, - 'alibi': alibi, + **pos_emb_config, }, ) mpt = MPTForCausalLM(hf_config) @@ -886,9 +962,54 @@ def test_save_from_pretrained(tmp_path: pathlib.Path): check_hf_model_equivalence(mpt, mpt2) -@pytest.mark.parametrize('alibi', [True, False]) -def test_forward_with_cache_and_padding(alibi: bool): +@pytest.mark.parametrize('attn_impl,device', [ + ('torch', 'cpu'), + ('flash', 'gpu'), + ('triton', 'gpu'), + ('torch', 'gpu'), +]) +@pytest.mark.parametrize('pos_emb_config', [{ + 'alibi': False, + 'rope': False +}, { + 'alibi': True, + 'rope': False +}, { + 'alibi': False, + 'rope': True, + 'rope_theta': 10000, + 'rope_impl': 'dail', + 'rope_dail_config': { + 'type': 'original', + 'pos_idx_in_fp32': True, + 'xpos_scale_base': 512, + }, +}, { + 'alibi': False, + 'rope': True, + 'rope_theta': 10000, + 'rope_impl': 'hf', + 'rope_hf_config': { + 'type': 'no_scaling', + 'factor': 1.0, + }, +}]) +def test_forward_with_cache_and_padding(attn_impl: str, device: str, + pos_emb_config: dict): # Tests that the result is the same with or without padding when using kv caching + if not torch.cuda.is_available() and device == 'gpu': + pytest.skip( + f'This test requires CUDA to be available in order to run 
with {attn_impl} attention.' + ) + if pos_emb_config['alibi'] and attn_impl == 'flash': + pytest.skip(f'alibi only implemented with torch and triton attention.') + if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and ( + device != 'gpu' or not is_flash_v2_installed()): + pytest.skip( + f'dail implementation of rope requires gpu and flash attention 2.') + + composer_device = get_device(device) + hf_config = MPTConfig( init_device='cpu', d_model=128, @@ -899,8 +1020,8 @@ def test_forward_with_cache_and_padding(alibi: bool): emb_pdrop=0.1, resid_pdrop=0.2, attn_config={ - 'attn_impl': 'torch', - 'alibi': alibi, + 'attn_impl': attn_impl, + **pos_emb_config, }, use_cache=True, init_config={ @@ -910,47 +1031,74 @@ def test_forward_with_cache_and_padding(alibi: bool): ) mpt = MPTForCausalLM(hf_config) + mpt = composer_device.module_to_device(mpt) mpt.eval() - - first_input_ids_no_padding = torch.tensor([[11274, 16390, 11]]) - first_attention_mask_no_padding = torch.tensor([[1, 1, 1]]).bool() - - # start with passing the first three tokens through (no padding) - first_output_no_padding = mpt( - first_input_ids_no_padding, - attention_mask=first_attention_mask_no_padding) - - second_input_ids_no_padding = torch.tensor([[11274, 16390, 11, 11274]]) - second_attention_mask_no_padding = torch.tensor([[1, 1, 1, 1]]).bool() - - # pass through the fourth token by itself, using the key-value cache (no padding) - second_output_no_padding = mpt( - second_input_ids_no_padding[:, -1].unsqueeze(-1), - attention_mask=second_attention_mask_no_padding, - past_key_values=first_output_no_padding.past_key_values) - - first_input_ids_padding = torch.tensor([[50256, 11274, 16390, 11]]) - first_attention_mask_padding = torch.tensor([[0, 1, 1, 1]]).bool() - - # start with passing the first three tokens through (with left padding) - first_output_padding = mpt(first_input_ids_padding, - attention_mask=first_attention_mask_padding) - - second_input_ids_padding = torch.tensor([[50256, 11274, 16390, 11, 11274]]) - second_attention_mask_padding = torch.tensor([[0, 1, 1, 1, 1]]).bool() - - # pass through the fourth token by itself, using the key-value cache (with left padding) - second_output_padding = mpt( - second_input_ids_padding[:, -1].unsqueeze(-1), - attention_mask=second_attention_mask_padding, - past_key_values=first_output_padding.past_key_values) - - # check that the outputs are the same with or without padding - torch.testing.assert_close(second_output_no_padding.logits, - second_output_padding.logits[:, - -1, :].unsqueeze(1), - atol=1e-6, - rtol=1e-6) + with get_precision_context('amp_bf16' if composer_device.name == + 'gpu' else 'fp32'): + first_input_ids_no_padding = torch.tensor([[11274, 16390, 11]]) + first_input_ids_no_padding = composer_device.tensor_to_device( + first_input_ids_no_padding) + first_attention_mask_no_padding = torch.tensor([[1, 1, 1]]).bool() + first_attention_mask_no_padding = composer_device.tensor_to_device( + first_attention_mask_no_padding) + + # start with passing the first three tokens through (no padding) + first_output_no_padding = mpt( + first_input_ids_no_padding, + attention_mask=first_attention_mask_no_padding) + + second_input_ids_no_padding = torch.tensor([[11274, 16390, 11, 11274]]) + second_input_ids_no_padding = composer_device.tensor_to_device( + second_input_ids_no_padding) + second_attention_mask_no_padding = torch.tensor([[1, 1, 1, 1]]).bool() + second_attention_mask_no_padding = composer_device.tensor_to_device( + second_attention_mask_no_padding) + + 
# pass through the fourth token by itself, using the key-value cache (no padding) + second_output_no_padding = mpt( + second_input_ids_no_padding[:, -1].unsqueeze(-1), + attention_mask=second_attention_mask_no_padding, + past_key_values=first_output_no_padding.past_key_values) + + first_input_ids_padding = torch.tensor([[50256, 11274, 16390, 11]]) + first_input_ids_padding = composer_device.tensor_to_device( + first_input_ids_padding) + first_attention_mask_padding = torch.tensor([[0, 1, 1, 1]]).bool() + first_attention_mask_padding = composer_device.tensor_to_device( + first_attention_mask_padding) + + # start with passing the first three tokens through (with left padding) + first_output_padding = mpt(first_input_ids_padding, + attention_mask=first_attention_mask_padding) + + second_input_ids_padding = torch.tensor( + [[50256, 11274, 16390, 11, 11274]]) + second_input_ids_padding = composer_device.tensor_to_device( + second_input_ids_padding) + second_attention_mask_padding = torch.tensor([[0, 1, 1, 1, 1]]).bool() + second_attention_mask_padding = composer_device.tensor_to_device( + second_attention_mask_padding) + + # pass through the fourth token by itself, using the key-value cache (with left padding) + second_output_padding = mpt( + second_input_ids_padding[:, -1].unsqueeze(-1), + attention_mask=second_attention_mask_padding, + past_key_values=first_output_padding.past_key_values) + + # check that the outputs are the same with or without padding + if pos_emb_config['rope'] and pos_emb_config[ + 'rope_impl'] == 'dail': # dail implementation of rope uses bf16 precision and hence the rotations have small numerical errors. This causes some differences between the outputs of padded and unpadded inputs. + torch.testing.assert_close( + second_output_no_padding.logits, + second_output_padding.logits[:, -1, :].unsqueeze(1), + atol=1e-2, + rtol=1e-6) + else: + torch.testing.assert_close( + second_output_no_padding.logits, + second_output_padding.logits[:, -1, :].unsqueeze(1), + atol=1e-6, + rtol=1e-6) @pytest.mark.parametrize('attn_impl,device', [ @@ -959,17 +1107,47 @@ def test_forward_with_cache_and_padding(alibi: bool): ('triton', 'gpu'), ('torch', 'gpu'), ]) -@pytest.mark.parametrize('alibi', [True, False]) -def test_forward_with_cache(attn_impl: str, device: str, alibi: bool): +@pytest.mark.parametrize('pos_emb_config', [{ + 'alibi': False, + 'rope': False +}, { + 'alibi': True, + 'rope': False +}, { + 'alibi': False, + 'rope': True, + 'rope_theta': 10000, + 'rope_impl': 'dail', + 'rope_dail_config': { + 'type': 'original', + 'pos_idx_in_fp32': True, + 'xpos_scale_base': 512, + }, +}, { + 'alibi': False, + 'rope': True, + 'rope_theta': 10000, + 'rope_impl': 'hf', + 'rope_hf_config': { + 'type': 'no_scaling', + 'factor': 1.0, + }, +}]) +def test_forward_with_cache(attn_impl: str, device: str, pos_emb_config: dict): # Test that model forward with and without the key-value cache produces the # same output. if not torch.cuda.is_available() and device == 'gpu': pytest.skip( f'This test requires CUDA to be available in order to run with {attn_impl} attention.' 
) - if alibi and attn_impl == 'flash': + if pos_emb_config['alibi'] and attn_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') + if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and ( + device != 'gpu' or not is_flash_v2_installed()): + pytest.skip( + f'dail implementation of rope requires gpu and flash attention 2.') + composer_device = get_device(device) hf_config = MPTConfig( @@ -983,10 +1161,8 @@ def test_forward_with_cache(attn_impl: str, device: str, alibi: bool): resid_pdrop=0.2, attn_config={ 'attn_impl': attn_impl, - 'alibi': alibi, + **pos_emb_config, }, - attn_impl=attn_impl, - alibi=alibi, use_cache=True, init_config={ 'name': 'baseline_', @@ -1066,8 +1242,53 @@ def test_forward_with_cache(attn_impl: str, device: str, alibi: bool): ) -@pytest.mark.parametrize('alibi', [True, False]) -def test_generate_with_past_kv(alibi: bool): +@pytest.mark.parametrize('attn_impl,device', [ + ('torch', 'cpu'), + ('flash', 'gpu'), + ('triton', 'gpu'), + ('torch', 'gpu'), +]) +@pytest.mark.parametrize('pos_emb_config', [{ + 'alibi': False, + 'rope': False +}, { + 'alibi': True, + 'rope': False +}, { + 'alibi': False, + 'rope': True, + 'rope_theta': 10000, + 'rope_impl': 'dail', + 'rope_dail_config': { + 'type': 'original', + 'pos_idx_in_fp32': True, + 'xpos_scale_base': 512, + }, +}, { + 'alibi': False, + 'rope': True, + 'rope_theta': 10000, + 'rope_impl': 'hf', + 'rope_hf_config': { + 'type': 'no_scaling', + 'factor': 1.0, + }, +}]) +def test_generate_with_past_kv(attn_impl: str, device: str, + pos_emb_config: dict): + if not torch.cuda.is_available() and device == 'gpu': + pytest.skip( + f'This test requires CUDA to be available in order to run with {attn_impl} attention.' + ) + if pos_emb_config['alibi'] and attn_impl == 'flash': + pytest.skip(f'alibi only implemented with torch and triton attention.') + if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and ( + device != 'gpu' or not is_flash_v2_installed()): + pytest.skip( + f'dail implementation of rope requires gpu and flash attention 2.') + + composer_device = get_device(device) + hf_config = MPTConfig( init_device='cpu', d_model=128, @@ -1078,8 +1299,8 @@ def test_generate_with_past_kv(alibi: bool): emb_pdrop=0.1, resid_pdrop=0.2, attn_config={ - 'attn_impl': 'torch', - 'alibi': alibi, + 'attn_impl': attn_impl, + **pos_emb_config, }, use_cache=True, init_config={ @@ -1088,33 +1309,46 @@ def test_generate_with_past_kv(alibi: bool): }, ) mpt = MPTForCausalLM(hf_config) + mpt = composer_device.module_to_device(mpt) mpt.eval() # no padding in the input no_padding_input_ids = torch.tensor([[11274, 16390, 11]]) + no_padding_input_ids = composer_device.tensor_to_device( + no_padding_input_ids) no_padding_attention_mask = torch.tensor([[1, 1, 1]]) + no_padding_attention_mask = composer_device.tensor_to_device( + no_padding_attention_mask) - with mock.patch.object(MPTForCausalLM, 'forward', - autospec=True) as forward_mocked: - forward_mocked.return_value = CausalLMOutputWithPast( - logits=torch.randn((1, 3, hf_config.vocab_size)), - past_key_values=[(torch.randn(1, 3, hf_config.d_model), - torch.randn(1, 3, hf_config.d_model)) - for _ in range(hf_config.n_layers)]) - _ = mpt.generate(input_ids=no_padding_input_ids, - attention_mask=no_padding_attention_mask, - max_new_tokens=2) - - assert forward_mocked.call_count == 2 - _, _, kwargs = forward_mocked.mock_calls[0] - assert kwargs['past_key_values'] is None - _, _, kwargs = forward_mocked.mock_calls[1] - assert 
kwargs['past_key_values'] is not None - assert len(kwargs['past_key_values']) == hf_config.n_layers - assert kwargs['past_key_values'][0][0].shape == (1, 3, - hf_config.d_model) + with get_precision_context('amp_bf16' if composer_device.name == + 'gpu' else 'fp32'): + with mock.patch.object(MPTForCausalLM, 'forward', + autospec=True) as forward_mocked: + forward_mocked.return_value = CausalLMOutputWithPast( + logits=torch.randn((1, 3, hf_config.vocab_size)), + past_key_values=[(torch.randn(1, 3, hf_config.d_model), + torch.randn(1, 3, hf_config.d_model)) + for _ in range(hf_config.n_layers)]) + _ = mpt.generate(input_ids=no_padding_input_ids, + attention_mask=no_padding_attention_mask, + max_new_tokens=2) + + assert forward_mocked.call_count == 2 + _, _, kwargs = forward_mocked.mock_calls[0] + assert kwargs['past_key_values'] is None + _, _, kwargs = forward_mocked.mock_calls[1] + assert kwargs['past_key_values'] is not None + assert len(kwargs['past_key_values']) == hf_config.n_layers + assert kwargs['past_key_values'][0][0].shape == (1, 3, + hf_config.d_model) +@pytest.mark.parametrize('attn_impl,device', [ + ('torch', 'cpu'), + ('flash', 'gpu'), + ('triton', 'gpu'), + ('torch', 'gpu'), +]) @pytest.mark.parametrize('generation_kwargs', [{ 'max_new_tokens': 2, 'num_beams': 4 @@ -1126,9 +1360,49 @@ def test_generate_with_past_kv(alibi: bool): 'do_sample': True, 'top_p': 0.95 }]) -@pytest.mark.parametrize('alibi', [True, False]) -def test_generation_kwargs_dont_crash(generation_kwargs: Dict[str, Any], - alibi: bool): +@pytest.mark.parametrize('pos_emb_config', [{ + 'alibi': False, + 'rope': False +}, { + 'alibi': True, + 'rope': False +}, { + 'alibi': False, + 'rope': True, + 'rope_theta': 10000, + 'rope_impl': 'dail', + 'rope_dail_config': { + 'type': 'original', + 'pos_idx_in_fp32': True, + 'xpos_scale_base': 512, + }, +}, { + 'alibi': False, + 'rope': True, + 'rope_theta': 10000, + 'rope_impl': 'hf', + 'rope_hf_config': { + 'type': 'no_scaling', + 'factor': 1.0, + }, +}]) +def test_generation_kwargs_dont_crash(attn_impl: str, device: str, + generation_kwargs: Dict[str, Any], + pos_emb_config: dict): + if not torch.cuda.is_available() and device == 'gpu': + pytest.skip( + f'This test requires CUDA to be available in order to run with {attn_impl} attention.' 
+ ) + if pos_emb_config['alibi'] and attn_impl == 'flash': + pytest.skip(f'alibi only implemented with torch and triton attention.') + + if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and ( + device != 'gpu' or not is_flash_v2_installed()): + pytest.skip( + f'dail implementation of rope requires gpu and flash attention 2.') + composer_device = get_device(device) + if device == 'gpu': # Switch deteminism off + torch.use_deterministic_algorithms(False) hf_config = MPTConfig( init_device='cpu', d_model=128, @@ -1139,35 +1413,73 @@ def test_generation_kwargs_dont_crash(generation_kwargs: Dict[str, Any], emb_pdrop=0.1, resid_pdrop=0.2, attn_config={ - 'attn_impl': 'torch', - 'alibi': alibi, + 'attn_impl': attn_impl, + **pos_emb_config, }, use_cache=True, ) mpt = MPTForCausalLM(hf_config) + mpt = composer_device.module_to_device(mpt) mpt.eval() - # no padding in the input - no_padding_input_ids = torch.tensor([[11274, 16390, 11]]) - no_padding_attention_mask = torch.tensor([[1, 1, 1]]) + with get_precision_context('amp_bf16' if composer_device.name == + 'gpu' else 'fp32'): + # no padding in the input + no_padding_input_ids = torch.tensor([[11274, 16390, 11]]) + no_padding_input_ids = composer_device.tensor_to_device( + no_padding_input_ids) + no_padding_attention_mask = torch.tensor([[1, 1, 1]]) + no_padding_attention_mask = composer_device.tensor_to_device( + no_padding_attention_mask) - _ = mpt.generate(input_ids=no_padding_input_ids, - attention_mask=no_padding_attention_mask, - **generation_kwargs) + _ = mpt.generate(input_ids=no_padding_input_ids, + attention_mask=no_padding_attention_mask, + **generation_kwargs) + if device == 'gpu': # Switch deteminism back on + reproducibility.configure_deterministic_mode() @pytest.mark.gpu @pytest.mark.parametrize('attention_impl', ['torch', 'flash', 'triton']) -@pytest.mark.parametrize('alibi', [True, False]) -def test_model_to(attention_impl: str, alibi: bool): +@pytest.mark.parametrize('pos_emb_config', [{ + 'alibi': False, + 'rope': False +}, { + 'alibi': True, + 'rope': False +}, { + 'alibi': False, + 'rope': True, + 'rope_theta': 10000, + 'rope_impl': 'dail', + 'rope_dail_config': { + 'type': 'original', + 'pos_idx_in_fp32': True, + 'xpos_scale_base': 512, + }, +}, { + 'alibi': False, + 'rope': True, + 'rope_theta': 10000, + 'rope_impl': 'hf', + 'rope_hf_config': { + 'type': 'no_scaling', + 'factor': 1.0, + }, +}]) +def test_model_to(attention_impl: str, pos_emb_config: dict): # test that moving the model to diff devices and dtypes in diff ways does not break the model if not torch.cuda.is_available(): pytest.skip( f'This test requires CUDA to be available in order to run with {attention_impl} attention.' 
) - if alibi and attention_impl == 'flash': + if pos_emb_config['alibi'] and attention_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') + if pos_emb_config['rope'] and pos_emb_config[ + 'rope_impl'] == 'dail' and not is_flash_v2_installed(): + pytest.skip(f'dail implementation of rope requires flash attention 2.') + hf_config = MPTConfig( init_device='cpu', d_model=128, @@ -1179,7 +1491,7 @@ def test_model_to(attention_impl: str, alibi: bool): resid_pdrop=0.2, attn_config={ 'attn_impl': attention_impl, - 'alibi': alibi, + **pos_emb_config, }, use_cache=True, init_config={ @@ -1204,7 +1516,8 @@ def test_model_to(attention_impl: str, alibi: bool): mpt = mpt.to('cpu') # verify the model still works - if attention_impl == 'torch': + if attention_impl == 'torch' and not ( + pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail'): with torch.autocast('cpu', dtype=torch.bfloat16, enabled=True): _ = mpt(input_ids.to('cpu'), attention_mask=attention_mask.to('cpu')) @@ -1221,7 +1534,8 @@ def test_model_to(attention_impl: str, alibi: bool): mpt = mpt.float() # verify the model still works - if attention_impl == 'torch': + if attention_impl == 'torch' and not ( + pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail'): _ = mpt(input_ids.to('cpu'), attention_mask=attention_mask.to('cpu')) mpt = mpt.half() @@ -1258,21 +1572,50 @@ def test_alibi_vs_hf(): ('triton', 'gpu'), ('torch', 'gpu'), ]) -@pytest.mark.parametrize('alibi', [True, False]) +@pytest.mark.parametrize('pos_emb_config', [{ + 'alibi': False, + 'rope': False +}, { + 'alibi': True, + 'rope': False +}, { + 'alibi': False, + 'rope': True, + 'rope_theta': 10000, + 'rope_impl': 'dail', + 'rope_dail_config': { + 'type': 'original', + 'pos_idx_in_fp32': True, + 'xpos_scale_base': 512, + }, +}, { + 'alibi': False, + 'rope': True, + 'rope_theta': 10000, + 'rope_impl': 'hf', + 'rope_hf_config': { + 'type': 'no_scaling', + 'factor': 1.0, + }, +}]) @pytest.mark.parametrize('output_attentions', [True, False]) @pytest.mark.parametrize('output_hidden_states', [True, False]) def test_forward_with_output_attentions_and_output_hidden_states( - attn_impl: str, device: str, alibi: bool, output_attentions: bool, - output_hidden_states: bool): + attn_impl: str, device: str, pos_emb_config: dict, + output_attentions: bool, output_hidden_states: bool): # Test that model forward with output_attentions_and_output_hidden_states if not torch.cuda.is_available() and device == 'gpu': pytest.skip( f'This test requires CUDA to be available in order to run with {attn_impl} attention.' 
) - if alibi and attn_impl == 'flash': + if pos_emb_config['alibi'] and attn_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') if output_attentions and attn_impl in ['flash', 'triton']: pytest.skip(f'output_attentions only implemented with torch attention.') + if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and ( + device != 'gpu' or not is_flash_v2_installed()): + pytest.skip( + f'dail implementation of rope requires gpu and flash attention 2.') composer_device = get_device(device) @@ -1289,10 +1632,8 @@ def test_forward_with_output_attentions_and_output_hidden_states( resid_pdrop=0.2, attn_config={ 'attn_impl': attn_impl, - 'alibi': alibi, + **pos_emb_config, }, - attn_impl=attn_impl, - alibi=alibi, use_cache=True, init_config={ 'name': 'baseline_', diff --git a/tests/test_rope_dail_vs_hf.py b/tests/test_rope_dail_vs_hf.py new file mode 100644 index 0000000000..598e308546 --- /dev/null +++ b/tests/test_rope_dail_vs_hf.py @@ -0,0 +1,145 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +import pytest +import torch +from composer.core.precision import get_precision_context +from omegaconf import OmegaConf as om + +from llmfoundry.models.layers.attention import is_flash_v2_installed +from llmfoundry.models.mpt.modeling_mpt import gen_rotary_embedding + + +@pytest.mark.gpu +@pytest.mark.parametrize('clip_qkv', [True, False]) +@pytest.mark.parametrize('qk_ln', [True, False]) +@pytest.mark.parametrize( + 'attn_type', + ['multihead_attention', 'multiquery_attention', 'grouped_query_attention']) +@pytest.mark.parametrize('seq_len', [1, 233, 2048]) +def test_rope_dail_vs_hf(clip_qkv: bool, + qk_ln: bool, + attn_type: str, + seq_len: int, + device: str = 'cuda'): + # compare rope rotations for the dail vs hf implementations + if not is_flash_v2_installed(): + pytest.skip('dail implementation of rope requires flash attention 2.') + + from llmfoundry.models.layers import attention + + cfg = om.create({ + 'attn_impl': 'flash', + 'd_model': 128, + 'n_heads': 4, + 'attn_pdrop': 0, + 'clip_qkv': clip_qkv, + 'qk_ln': qk_ln, + }) + + batch_size = 2 + assert cfg.d_model % cfg.n_heads == 0 + if attn_type == 'grouped_query_attention': + cfg.kv_n_heads = 2 + + attn0 = attention.ATTN_CLASS_REGISTRY[attn_type](**cfg).to(device) + attn1 = attention.ATTN_CLASS_REGISTRY[attn_type](**cfg).to(device) + + attn1.load_state_dict(attn0.state_dict()) + x0 = torch.randn(batch_size, seq_len, cfg.d_model).to(device) + x1 = x0.clone().detach() + x0.requires_grad = True + x1.requires_grad = True + attention_mask = torch.ones(batch_size, seq_len).to(device).bool() + + with get_precision_context('amp_bf16'): + dail_rope_config = { + 'rope_theta': 10000, + 'rope_impl': 'dail', + 'rope_dail_config': { + 'type': 'original', + 'pos_idx_in_fp32': True, + 'xpos_scale_base': 512, + } + } + hf_rope_config = { + 'rope_theta': 10000, + 'rope_impl': 'hf', + 'rope_hf_config': { + 'type': 'no_scaling', + 'factor': 1.0, + } + } + + dail_rope = gen_rotary_embedding( + rope_head_dim=cfg.d_model // cfg.n_heads, + rope_impl=dail_rope_config['rope_impl'], + rope_theta=dail_rope_config['rope_theta'], + rope_dail_config=dail_rope_config['rope_dail_config'], + rope_hf_config={}, + max_seq_len=seq_len).to('cuda') + dail_rope_w_meta_info = { + 'impl': 'dail', + 'rotary_emb': dail_rope, + 'offset_info': 0, + 'seq_len': seq_len, + } + + hf_rope = gen_rotary_embedding( + rope_head_dim=cfg.d_model // cfg.n_heads, + rope_impl=hf_rope_config['rope_impl'], + 
rope_theta=hf_rope_config['rope_theta'], + rope_dail_config={}, + rope_hf_config=hf_rope_config['rope_hf_config'], + max_seq_len=seq_len).to('cuda') + pos = torch.arange(seq_len).unsqueeze(0).to(device='cuda') + # adjust the position indices to account for padding tokens + pos = torch.clamp( + pos - torch.cumsum((~attention_mask).to(torch.int32), dim=1), + min=0, + ) + hf_rope_w_meta_info = { + 'impl': 'hf', + 'rotary_emb': hf_rope, + 'offset_info': pos, + 'seq_len': seq_len, + } + + y0, _, _ = attn0(x0, + past_key_value=None, + attn_bias=None, + attention_mask=attention_mask, + rotary_emb_w_meta_info=dail_rope_w_meta_info, + is_causal=True) + + y1, _, _ = attn1(x1, + past_key_value=None, + attn_bias=None, + attention_mask=attention_mask, + rotary_emb_w_meta_info=hf_rope_w_meta_info, + is_causal=True) + + y0 *= attention_mask.unsqueeze(-1) + y1 *= attention_mask.unsqueeze(-1) + + loss0 = y0.sum() + loss1 = y1.sum() + + loss0.backward() + loss1.backward() + + torch.testing.assert_close(y0, y1, rtol=1e-2, atol=1e-2) + + torch_name_param_map = {n: p for n, p in attn1.named_parameters()} + for n, p in attn0.named_parameters(): + tp = torch_name_param_map[n] + assert p.grad is not None + assert tp.grad is not None + torch.testing.assert_close(p, tp, rtol=1e-2, atol=1e-2) + # Relaxed to a l2-norm based check. + assert torch.norm(tp.grad - p.grad) <= 1e-2 + 1e-2 * torch.norm(p.grad) + + assert x0.grad is not None + assert x1.grad is not None + # Relaxed to a l2-norm based check. + assert torch.norm(x0.grad - x1.grad) <= 1e-2 + 1e-2 * torch.norm(x0.grad) From 2b74cb25060c0eb7c7961a0f82be3ebc4afc5e07 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Mon, 6 Nov 2023 15:33:46 -0800 Subject: [PATCH 23/49] Add databricks dependency (#717) --- setup.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/setup.py b/setup.py index f528838d35..81178686d2 100644 --- a/setup.py +++ b/setup.py @@ -83,6 +83,10 @@ 'hf_transfer==0.1.3', ] +extra_deps['databricks'] = [ + 'mosaicml[databricks]', +] + extra_deps['tensorboard'] = [ 'mosaicml[tensorboard]>=0.16.1,<0.17', ] From dd15791818fa53ae792de66d3529d94e0dcb83d9 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Mon, 6 Nov 2023 23:19:54 -0800 Subject: [PATCH 24/49] Set persistent_workers = False for packing profiling (#718) --- llmfoundry/data/finetuning/dataloader.py | 7 +++++++ llmfoundry/data/packing.py | 1 + 2 files changed, 8 insertions(+) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 6e988ac149..44d6d345f5 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -400,6 +400,13 @@ def _build_collate_fn( packing_ratio = auto_packing_ratio(dataloader_cfg, tokenizer, device_batch_size) + if isinstance(packing_ratio, str): + raise ValueError( + 'dataset.packing_ratio must be a float or "auto", but it was set to ' + + f'{packing_ratio}.') + + log.info(f'Using packing ratio {packing_ratio}') + if packing_ratio == 1.0: return collate_fn, device_batch_size elif packing_ratio < 1.0: diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index 1ae9efcce5..45322c9b2f 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -348,6 +348,7 @@ def profile_packing( dataloader_cfg.drop_last = False dataloader_cfg.num_workers = 0 dataloader_cfg.prefetch_factor = None + dataloader_cfg.persistent_workers = False # Determine the packing_ratio values we'll try packing_ratios, raw_batch_sizes = [], [] 
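For readers following the packing change just above: profile_packing builds short-lived dataloaders only to measure packing efficiency, and the surrounding config overrides already disable multiprocessing and prefetching for them; this patch additionally turns off persistent_workers so no worker processes linger after profiling. Below is a minimal standalone sketch of those overrides, assuming an omegaconf DictConfig like the one profile_packing receives; the helper name and example values are illustrative and not part of the patch.

from copy import deepcopy

from omegaconf import DictConfig
from omegaconf import OmegaConf as om


def profiling_overrides(dataloader_cfg: DictConfig) -> DictConfig:
    # Work on a copy so the real training dataloader config is untouched.
    cfg = deepcopy(dataloader_cfg)
    cfg.drop_last = False  # keep every sample when measuring packing waste
    cfg.num_workers = 0  # single-process loading for the short profiling pass
    cfg.prefetch_factor = None  # not meaningful when num_workers == 0
    cfg.persistent_workers = False  # do not leave worker processes alive after profiling
    return cfg


# Example with hypothetical values:
base_cfg = om.create({
    'drop_last': True,
    'num_workers': 8,
    'prefetch_factor': 2,
    'persistent_workers': True,
})
profile_cfg = profiling_overrides(base_cfg)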
From 84c86e3b0a3b63c0c71e52f1f762325daa8adc64 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 7 Nov 2023 15:25:09 -0500 Subject: [PATCH 25/49] raise timeout (#719) --- .github/workflows/pr-gpu.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml index 1151837111..ffbfac4585 100644 --- a/.github/workflows/pr-gpu.yaml +++ b/.github/workflows/pr-gpu.yaml @@ -40,7 +40,7 @@ jobs: if: github.repository_owner == 'mosaicml' with: container: ${{ matrix.container }} - mcloud-timeout: 1200 + mcloud-timeout: 1800 name: ${{ matrix.name }} pytest-command: ${{ matrix.pytest_command }} pytest-markers: ${{ matrix.markers }} From ab9b9385ed4a89749e853b59729982144bbb35f6 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Wed, 8 Nov 2023 17:22:50 -0800 Subject: [PATCH 26/49] change default overwrite to True (#724) --- llmfoundry/callbacks/hf_checkpointer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 3050529a5a..4f400738e4 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -53,7 +53,7 @@ def __init__( save_interval: Union[str, int, Time], huggingface_folder_name: str = 'ba{batch}', precision: str = 'float32', - overwrite: bool = False, + overwrite: bool = True, mlflow_registered_model_name: Optional[str] = None, mlflow_logging_config: Optional[dict] = None, ): From efaa5454304f43a3d3525a54a6445b656b1cef24 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Thu, 9 Nov 2023 07:39:12 -0800 Subject: [PATCH 27/49] Attempt to fix a very occasional hang in datasets map/filter (#725) * dont use lambdas * tokenizer building distributed safety --- llmfoundry/data/finetuning/tasks.py | 16 ++++++++++++---- llmfoundry/utils/builders.py | 15 +++++++++++++++ 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index 3673a48217..67a27ac239 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -362,8 +362,12 @@ def dataset_mapper(example: Dict): num_proc=num_cpus_to_use, desc='Tokenizing dataset', ) + + def filter_long_prompts(example: Dict) -> bool: + return len(example['input_ids']) < max_seq_len + prompt_length_filtered_dataset = tokenized_dataset.filter( - lambda example: len(example['input_ids']) < max_seq_len, + filter_long_prompts, num_proc=num_cpus_to_use, desc='Filtering out long prompts', ) @@ -376,10 +380,14 @@ def dataset_mapper(example: Dict): ) pad_token_id = tokenizer.pad_token_id + + def filter_empty_examples(example: Dict) -> bool: + return len(example['input_ids']) > 0 and len( + example['labels']) > 0 and any( + token_id != pad_token_id for token_id in example['labels']) + empty_examples_dropped_dataset = prompt_length_filtered_dataset.filter( - lambda example: len(example['input_ids']) > 0 and len(example[ - 'labels']) > 0 and any(token_id != pad_token_id - for token_id in example['labels']), + filter_empty_examples, num_proc=num_cpus_to_use, desc='Filtering out empty examples') diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index f027afb0ce..2251ab5fbd 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -188,6 +188,12 @@ def build_tokenizer( os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = '1' 
os.environ['TOKENIZERS_PARALLELISM'] = 'false' + signal_file_path = f'.node_{dist.get_node_rank()}_local_rank0_completed_tokenizer_setup' + + # Make sure the tokenizer files are downloaded and cached first by local rank 0 + with dist.local_rank_zero_download_and_wait(signal_file_path): + pass + if tokenizer_name.startswith('tiktoken'): tokenizer = TiktokenTokenizerWrapper(**tokenizer_kwargs) else: @@ -202,6 +208,15 @@ def build_tokenizer( int(1e30), ) + if dist.get_local_rank() == 0: + with open(signal_file_path, 'wb') as f: + f.write(b'local_rank0_completed_tokenizer_setup') + + dist.barrier() + + if dist.get_local_rank() == 0: + os.remove(signal_file_path) + return tokenizer From d2ddb834650b085337c4d914f77bb80c76201e9d Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Thu, 9 Nov 2023 10:25:13 -0800 Subject: [PATCH 28/49] Add Unity Catalog support to HF checkpointer (#721) --- llmfoundry/callbacks/hf_checkpointer.py | 31 +++++++++++-------------- 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 4f400738e4..e02bf03693 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -14,9 +14,10 @@ from composer.core import Callback, Event, State, Time, TimeUnit from composer.core.state import fsdp_state_dict_type_context from composer.loggers import Logger, MLFlowLogger -from composer.loggers.remote_uploader_downloader import RemoteUploaderDownloader from composer.models import HuggingFaceModel -from composer.utils import dist, format_name_with_dist_and_time, parse_uri +from composer.utils import (dist, format_name_with_dist_and_time, + maybe_create_remote_uploader_downloader_from_uri, + parse_uri) from composer.utils.misc import create_interval_scheduler from transformers import PreTrainedModel, PreTrainedTokenizerBase @@ -57,8 +58,7 @@ def __init__( mlflow_registered_model_name: Optional[str] = None, mlflow_logging_config: Optional[dict] = None, ): - self.backend, self.bucket_name, self.save_dir_format_str = parse_uri( - save_folder) + _, _, self.save_dir_format_str = parse_uri(save_folder) self.overwrite = overwrite self.precision = precision self.dtype = { @@ -93,13 +93,11 @@ def __init__( self.save_interval = save_interval self.check_interval = create_interval_scheduler( save_interval, include_end_of_training=True) - self.upload_to_object_store = (self.backend != '') - if self.upload_to_object_store: - self.remote_ud = RemoteUploaderDownloader( - bucket_uri=f'{self.backend}://{self.bucket_name}', - num_concurrent_uploads=4) - else: - self.remote_ud = None + + self.remote_ud = maybe_create_remote_uploader_downloader_from_uri( + save_folder, loggers=[]) + if self.remote_ud is not None: + self.remote_ud._num_concurrent_uploads = 4 self.last_checkpoint_batch: Optional[Time] = None self.mlflow_loggers = [] @@ -115,7 +113,7 @@ def run_event(self, event: Event, state: State, logger: Logger) -> None: raise ValueError( f'`HuggingFaceCheckpointer` is only compatible with `HuggingFaceModel`s. 
' + f'Got {type(state.model)} instead.') - if self.upload_to_object_store and self.remote_ud is not None: + if self.remote_ud is not None: self.remote_ud.init(state, logger) state.callbacks.append(self.remote_ud) @@ -169,7 +167,7 @@ def _save_checkpoint(self, state: State, logger: Logger): self.huggingface_folder_name_fstr), state.run_name, state.timestamp) dir_context_mgr = tempfile.TemporaryDirectory( - ) if self.upload_to_object_store else contextlib.nullcontext( + ) if self.remote_ud is not None else contextlib.nullcontext( enter_result=save_dir) with dir_context_mgr as temp_save_dir: @@ -233,11 +231,8 @@ def _save_checkpoint(self, state: State, logger: Logger): log.debug('Editing MPT files for HuggingFace compatibility') edit_files_for_hf_compatibility(temp_save_dir) - if self.upload_to_object_store: - assert self.remote_ud is not None - log.info( - f'Uploading HuggingFace formatted checkpoint to {self.backend}://{self.bucket_name}/{save_dir}' - ) + if self.remote_ud is not None: + log.info(f'Uploading HuggingFace formatted checkpoint') for filename in os.listdir(temp_save_dir): self.remote_ud.upload_file( state=state, From 2f91a64a348b0f745ab83f66acdad2a07082cc14 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Thu, 9 Nov 2023 16:40:26 -0800 Subject: [PATCH 29/49] Combine filters into one, to avoid datasets error (#729) --- llmfoundry/data/finetuning/tasks.py | 46 +++++++++++------------------ 1 file changed, 17 insertions(+), 29 deletions(-) diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index 67a27ac239..6ba6ad96c8 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -363,43 +363,31 @@ def dataset_mapper(example: Dict): desc='Tokenizing dataset', ) - def filter_long_prompts(example: Dict) -> bool: - return len(example['input_ids']) < max_seq_len + pad_token_id = tokenizer.pad_token_id - prompt_length_filtered_dataset = tokenized_dataset.filter( - filter_long_prompts, + def filter_long_or_empty_examples(example: Dict) -> bool: + less_than_max_seq_len = len(example['input_ids']) < max_seq_len + non_empty_input = len(example['input_ids']) > 0 + non_empty_labels = len(example['labels']) > 0 + non_padding_response = any( + token_id != pad_token_id for token_id in example['labels']) + return (less_than_max_seq_len and non_empty_input and + non_empty_labels and non_padding_response) + + filtered_dataset = tokenized_dataset.filter( + filter_long_or_empty_examples, num_proc=num_cpus_to_use, desc='Filtering out long prompts', ) - examples_removed = len(tokenized_dataset) - len( - prompt_length_filtered_dataset) + examples_removed = len(tokenized_dataset) - len(filtered_dataset) if examples_removed > 0: warnings.warn( - f'Dropped {examples_removed} examples where the prompt was longer than {max_seq_len}.' + f'Dropped {examples_removed} examples where the prompt was longer than {max_seq_len}, ' + + + 'the prompt or response was empty, or the response was all padding tokens.' 
) - pad_token_id = tokenizer.pad_token_id - - def filter_empty_examples(example: Dict) -> bool: - return len(example['input_ids']) > 0 and len( - example['labels']) > 0 and any( - token_id != pad_token_id for token_id in example['labels']) - - empty_examples_dropped_dataset = prompt_length_filtered_dataset.filter( - filter_empty_examples, - num_proc=num_cpus_to_use, - desc='Filtering out empty examples') - - log.debug('Done tokenizing and filtering examples.') - - empty_examples_removed = len(prompt_length_filtered_dataset) - len( - empty_examples_dropped_dataset) - if empty_examples_removed > 0: - warnings.warn( - f'Dropped {empty_examples_removed} examples where the prompt or response was empty, ' - + 'or the response was only padding tokens.') - # Now local rank 0 indicates to the other ranks that it is done if dist.get_local_rank() == 0: log.debug('Local rank 0 finished data prep') @@ -414,7 +402,7 @@ def filter_empty_examples(example: Dict) -> bool: os.remove(signal_file_path) log.debug('All ranks finished data prep') - return empty_examples_dropped_dataset + return filtered_dataset def build_from_streaming(self, *args: Any, **kwargs: Any) -> StreamingFinetuningDataset: From 7c4d24a8bc3713d07de889df5c2f21211ae4945e Mon Sep 17 00:00:00 2001 From: Jerry Chen Date: Thu, 9 Nov 2023 17:16:39 -0800 Subject: [PATCH 30/49] Fix logging verbosity in HF model download script and repair symlinks (#727) * Make logs appear and disable InsecureRequestWarning for ignore_cert * Clean up * Repair symlinks after cache download * Clean up logging --------- Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- llmfoundry/utils/model_download_utils.py | 27 +++++++++++++++--------- scripts/misc/download_hf_model.py | 20 ++++++++++++++++-- 2 files changed, 35 insertions(+), 12 deletions(-) diff --git a/llmfoundry/utils/model_download_utils.py b/llmfoundry/utils/model_download_utils.py index d268cb78b7..2104455e0f 100644 --- a/llmfoundry/utils/model_download_utils.py +++ b/llmfoundry/utils/model_download_utils.py @@ -6,6 +6,7 @@ import logging import os import time +import warnings from http import HTTPStatus from typing import Optional from urllib.parse import urljoin @@ -14,6 +15,7 @@ import requests import tenacity from bs4 import BeautifulSoup +from requests.packages.urllib3.exceptions import InsecureRequestWarning from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME from transformers.utils import WEIGHTS_INDEX_NAME as PYTORCH_WEIGHTS_INDEX_NAME from transformers.utils import WEIGHTS_NAME as PYTORCH_WEIGHTS_NAME @@ -212,16 +214,21 @@ def download_from_cache_server( download_start = time.time() - # Only downloads the blobs in order to avoid downloading model files twice due to the - # symlnks in the Hugging Face cache structure: - _recursive_download( - session, - cache_base_url, - # Trailing slash to indicate directory - f'{formatted_model_name}/blobs/', - save_dir, - ignore_cert=ignore_cert, - ) + # Temporarily suppress noisy SSL certificate verification warnings if ignore_cert is set to True + with warnings.catch_warnings(): + if ignore_cert: + warnings.simplefilter('ignore', category=InsecureRequestWarning) + + # Only downloads the blobs in order to avoid downloading model files twice due to the + # symlnks in the Hugging Face cache structure: + _recursive_download( + session, + cache_base_url, + # Trailing slash to indicate directory + f'{formatted_model_name}/blobs/', + save_dir, + ignore_cert=ignore_cert, + ) download_duration = time.time() - 
download_start log.info( f'Downloaded model {model_name} from cache server in {download_duration} seconds' diff --git a/scripts/misc/download_hf_model.py b/scripts/misc/download_hf_model.py index 6465a552c2..58c3445e7d 100644 --- a/scripts/misc/download_hf_model.py +++ b/scripts/misc/download_hf_model.py @@ -14,6 +14,8 @@ HF_TOKEN_ENV_VAR = 'HUGGING_FACE_HUB_TOKEN' +logging.basicConfig(format=f'%(asctime)s: %(levelname)s: %(name)s: %(message)s', + level=logging.INFO) log = logging.getLogger(__name__) if __name__ == '__main__': @@ -34,7 +36,7 @@ argparser.add_argument( '--fallback', action='store_true', - default=False, + default=True, help= 'Whether to fallback to downloading from Hugging Face if download from cache fails', ) @@ -53,11 +55,25 @@ token=args.token, ignore_cert=args.ignore_cert, ) + + # A little hacky: run the Hugging Face download just to repair the symlinks in the HF cache file structure. + # This shouldn't actually download any files if the cache server download was successful, but should address + # a non-deterministic bug where the symlinks aren't repaired properly by the time the model is initialized. + log.info('Repairing Hugging Face cache symlinks') + + # Hide some noisy logs that aren't important for just the symlink repair. + old_level = logging.getLogger().level + logging.getLogger().setLevel(logging.ERROR) + download_from_hf_hub(args.model, + save_dir=args.save_dir, + token=args.token) + logging.getLogger().setLevel(old_level) + except PermissionError: log.error(f'Not authorized to download {args.model}.') except Exception as e: if args.fallback: - log.warn( + log.warning( f'Failed to download {args.model} from cache server. Falling back to Hugging Face Hub. Error: {e}' ) download_from_hf_hub(args.model, From c3f0cf976f4de8987fc597b1bed3b0857644f036 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Fri, 10 Nov 2023 10:53:09 -0800 Subject: [PATCH 31/49] Gate the dist calls in build_tokenizer (#732) --- llmfoundry/utils/builders.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 2251ab5fbd..142e714b55 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -190,9 +190,11 @@ def build_tokenizer( signal_file_path = f'.node_{dist.get_node_rank()}_local_rank0_completed_tokenizer_setup' - # Make sure the tokenizer files are downloaded and cached first by local rank 0 - with dist.local_rank_zero_download_and_wait(signal_file_path): - pass + if dist.is_available() and dist.is_initialized( + ) and dist.get_world_size() > 1: + # Make sure the tokenizer files are downloaded and cached first by local rank 0 + with dist.local_rank_zero_download_and_wait(signal_file_path): + pass if tokenizer_name.startswith('tiktoken'): tokenizer = TiktokenTokenizerWrapper(**tokenizer_kwargs) @@ -208,14 +210,16 @@ def build_tokenizer( int(1e30), ) - if dist.get_local_rank() == 0: - with open(signal_file_path, 'wb') as f: - f.write(b'local_rank0_completed_tokenizer_setup') + if dist.is_available() and dist.is_initialized( + ) and dist.get_world_size() > 1: + if dist.get_local_rank() == 0: + with open(signal_file_path, 'wb') as f: + f.write(b'local_rank0_completed_tokenizer_setup') - dist.barrier() + dist.barrier() - if dist.get_local_rank() == 0: - os.remove(signal_file_path) + if dist.get_local_rank() == 0: + os.remove(signal_file_path) return tokenizer From e7223dada7841b087d120030b6e67358bd780473 Mon Sep 17 00:00:00 
2001 From: Charles Tang Date: Fri, 10 Nov 2023 11:36:34 -0800 Subject: [PATCH 32/49] Create AWS docker image for fine tuning (#731) --- .github/workflows/docker.yaml | 7 +++- README.md | 60 ++++++++++++++++++----------------- 2 files changed, 37 insertions(+), 30 deletions(-) diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml index 83c9a63884..13a835356c 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/docker.yaml @@ -29,7 +29,12 @@ jobs: - name: '2.1.0_cu121_flash2' base_image: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04 dep_groups: '[gpu-flash2]' - + - name: '2.1.0_cu121_aws' + base_image: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04-aws + dep_groups: '[gpu]' + - name: '2.1.0_cu121_flash2_aws' + base_image: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04-aws + dep_groups: '[gpu-flash2]' steps: - name: Maximize Build Space on Worker uses: easimon/maximize-build-space@v4 diff --git a/README.md b/README.md index 46074613e1..4a4e60e844 100644 --- a/README.md +++ b/README.md @@ -45,15 +45,15 @@ You'll find in this repo: Mosaic Pretrained Transformers (MPT) are GPT-style models with some special features -- Flash Attention for efficiency, ALiBi for context length extrapolation, and stability improvements to mitigate loss spikes. As part of MosaicML's Foundation series, we have open-sourced several MPT models: -| Model | Context Length | Download | Demo | Commercial use? | -|--------------------|----------------|----------------------------------------------------|------------------------------------------------------------------|-----------------| -| MPT-30B | 8192 | https://huggingface.co/mosaicml/mpt-30b | | Yes | -| MPT-30B-Instruct | 8192 | https://huggingface.co/mosaicml/mpt-30b-instruct | | Yes | -| MPT-30B-Chat | 8192 | https://huggingface.co/mosaicml/mpt-30b-chat | [Demo](https://huggingface.co/spaces/mosaicml/mpt-30b-chat) | No | -| MPT-7B | 2048 | https://huggingface.co/mosaicml/mpt-7b | | Yes | -| MPT-7B-Instruct | 2048 | https://huggingface.co/mosaicml/mpt-7b-instruct | | Yes | -| MPT-7B-Chat | 2048 | https://huggingface.co/mosaicml/mpt-7b-chat | [Demo](https://huggingface.co/spaces/mosaicml/mpt-7b-chat) | No | -| MPT-7B-StoryWriter | 65536 | https://huggingface.co/mosaicml/mpt-7b-storywriter | | Yes | +| Model | Context Length | Download | Demo | Commercial use? 
| +| ------------------ | -------------- | -------------------------------------------------- | ----------------------------------------------------------- | --------------- | +| MPT-30B | 8192 | https://huggingface.co/mosaicml/mpt-30b | | Yes | +| MPT-30B-Instruct | 8192 | https://huggingface.co/mosaicml/mpt-30b-instruct | | Yes | +| MPT-30B-Chat | 8192 | https://huggingface.co/mosaicml/mpt-30b-chat | [Demo](https://huggingface.co/spaces/mosaicml/mpt-30b-chat) | No | +| MPT-7B | 2048 | https://huggingface.co/mosaicml/mpt-7b | | Yes | +| MPT-7B-Instruct | 2048 | https://huggingface.co/mosaicml/mpt-7b-instruct | | Yes | +| MPT-7B-Chat | 2048 | https://huggingface.co/mosaicml/mpt-7b-chat | [Demo](https://huggingface.co/spaces/mosaicml/mpt-7b-chat) | No | +| MPT-7B-StoryWriter | 65536 | https://huggingface.co/mosaicml/mpt-7b-storywriter | | Yes | To try out these models locally, [follow the instructions](https://github.com/mosaicml/llm-foundry/tree/main/scripts/inference#interactive-generation-with-modelgenerate) in `scripts/inference/README.md` to prompt HF models using our [hf_generate.py](https://github.com/mosaicml/llm-foundry/blob/main/scripts/inference/hf_generate.py) or [hf_chat.py](https://github.com/mosaicml/llm-foundry/blob/main/scripts/inference/hf_chat.py) scripts. @@ -89,17 +89,17 @@ This codebase has been tested with PyTorch 1.13.1 and PyTorch 2.0.1 on systems w This codebase may also work on systems with other devices, such as consumer NVIDIA cards and AMD cards, but we are not actively testing these systems. If you have success/failure using LLM Foundry on other systems, please let us know in a Github issue and we will update the support matrix! -| Device | Torch Version | Cuda Version | Status | -|---------------------------|------------------|--------------|-------------------------------| -| A100-40GB/80GB | 1.13.1 | 11.7 | :white_check_mark: Supported | -| A100-40GB/80GB | 2.0.1 | 11.7, 11.8 | :white_check_mark: Supported | -| A100-40GB/80GB | 2.1.0 | 11.8, 12.1 | :white_check_mark: Supported | -| H100-80GB | 1.13.1 | 11.7 | :x: Not Supported | -| H100-80GB | 2.0.1 | 11.8 | :white_check_mark: Supported | -| H100-80GB | 2.1.0 | 12.1 | :white_check_mark: Supported | -| A10-24GB | 1.13.1 | 11.7 | :construction: In Progress | -| A10-24GB | 2.0.1 | 11.7, 11.8 | :construction: In Progress | -| MI250 | 2.0.1 | ROCm 5.4 | :construction: In Progress | +| Device | Torch Version | Cuda Version | Status | +| -------------- | ------------- | ------------ | ---------------------------- | +| A100-40GB/80GB | 1.13.1 | 11.7 | :white_check_mark: Supported | +| A100-40GB/80GB | 2.0.1 | 11.7, 11.8 | :white_check_mark: Supported | +| A100-40GB/80GB | 2.1.0 | 11.8, 12.1 | :white_check_mark: Supported | +| H100-80GB | 1.13.1 | 11.7 | :x: Not Supported | +| H100-80GB | 2.0.1 | 11.8 | :white_check_mark: Supported | +| H100-80GB | 2.1.0 | 12.1 | :white_check_mark: Supported | +| A10-24GB | 1.13.1 | 11.7 | :construction: In Progress | +| A10-24GB | 2.0.1 | 11.7, 11.8 | :construction: In Progress | +| MI250 | 2.0.1 | ROCm 5.4 | :construction: In Progress | ## MosaicML Docker Images We highly recommend using our prebuilt Docker images. You can find them here: https://hub.docker.com/orgs/mosaicml/repositories. @@ -111,15 +111,17 @@ You can select a specific commit hash such as `mosaicml/llm-foundry:1.13.1_cu117 **Please Note:** The `mosaicml/llm-foundry` images do not come with the `llm-foundry` package preinstalled, just the dependencies. 
You will still need to `pip install llm-foundry` either from PyPi or from source. -| Docker Image | Torch Version | Cuda Version | LLM Foundry dependencies installed? | -|-------------------------------------------------------------|----------------|--------------|-------------------------------------| -| `mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04` | 1.13.1 | 11.7 | No | -| `mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04` | 2.0.1 | 11.8 | No | -| `mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04` | 2.1.0 | 12.1 | No | -| `mosaicml/llm-foundry:1.13.1_cu117-latest` | 1.13.1 | 11.7 | Yes | -| `mosaicml/llm-foundry:2.0.1_cu118-latest` | 2.0.1 | 11.8 | Yes | -| `mosaicml/llm-foundry:2.1.0_cu121-latest` | 2.1.0 | 12.1 | Yes (flash attention v1) | -| `mosaicml/llm-foundry:2.1.0_cu121_flash2-latest` | 2.1.0 | 12.1 | Yes (flash attention v2) | +| Docker Image | Torch Version | Cuda Version | LLM Foundry dependencies installed? | +| ------------------------------------------------------ | ------------- | ----------------- | ----------------------------------- | +| `mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04` | 1.13.1 | 11.7 (Infiniband) | No | +| `mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04` | 2.0.1 | 11.8 (Infiniband) | No | +| `mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04` | 2.1.0 | 12.1 (Infiniband) | No | +| `mosaicml/llm-foundry:1.13.1_cu117-latest` | 1.13.1 | 11.7 (Infiniband) | Yes | +| `mosaicml/llm-foundry:2.0.1_cu118-latest` | 2.0.1 | 11.8 (Infiniband) | Yes | +| `mosaicml/llm-foundry:2.1.0_cu121-latest` | 2.1.0 | 12.1 (Infiniband) | Yes (flash attention v1) | +| `mosaicml/llm-foundry:2.1.0_cu121_flash2-latest` | 2.1.0 | 12.1 (Infiniband) | Yes (flash attention v2) | +| `mosaicml/llm-foundry:2.1.0_cu121_aws-latest` | 2.1.0 | 12.1 (EFA) | Yes (flash attention v1) | +| `mosaicml/llm-foundry:2.1.0_cu121_flash2_aws-latest` | 2.1.0 | 12.1 (EFA) | Yes (flash attention v2) | # Installation From d11ba8209bcbd9d1afefa9a468caecdca979c137 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Mon, 13 Nov 2023 10:40:11 -0800 Subject: [PATCH 33/49] Make TiktokenTokenizerWrapper compatible with convert_composer_to_hf.py (#730) --- .../utils/checkpoint_conversion_helpers.py | 25 ++++++++++++++++--- .../inference/convert_composer_mpt_to_ft.py | 11 ++++++-- scripts/inference/convert_composer_to_hf.py | 14 +++++++++-- tests/test_hf_conversion_script.py | 3 +++ 4 files changed, 45 insertions(+), 8 deletions(-) diff --git a/llmfoundry/utils/checkpoint_conversion_helpers.py b/llmfoundry/utils/checkpoint_conversion_helpers.py index 0627cec4cd..35e77eab6c 100644 --- a/llmfoundry/utils/checkpoint_conversion_helpers.py +++ b/llmfoundry/utils/checkpoint_conversion_helpers.py @@ -19,7 +19,8 @@ import numpy as np import sentencepiece as spm -from transformers import AutoTokenizer, PreTrainedTokenizer +from transformers import (AutoTokenizer, PreTrainedTokenizer, + PreTrainedTokenizerFast) log = logging.getLogger(__name__) @@ -35,8 +36,9 @@ def _get_weight_data_type(data_type: str): # TODO: move this functionality to composer once the bug fixes are upstreamed def get_hf_tokenizer_from_composer_state_dict( - state_dict: Dict[str, Any], - tokenizer_save_dir: Optional[str] = None + state_dict: Dict[str, Any], + trust_remote_code: bool, + tokenizer_save_dir: Optional[str] = None, ) -> Optional[PreTrainedTokenizer]: if 'state' not in state_dict: raise RuntimeError( @@ -85,7 +87,8 @@ def get_hf_tokenizer_from_composer_state_dict( with open(tokenizer_file_path, 'wb') as _tmp_file: 
_tmp_file.write(s.serialized_model_proto()) - hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_dir) + hf_tokenizer = load_tokenizer(tokenizer_save_dir, + trust_remote_code=trust_remote_code) # remove 'name_or_path' hf_tokenizer.name_or_path = '' @@ -94,6 +97,20 @@ def get_hf_tokenizer_from_composer_state_dict( return hf_tokenizer +def load_tokenizer( + tokenizer_save_dir: str, trust_remote_code: bool +) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: + try: + return AutoTokenizer.from_pretrained( + tokenizer_save_dir, trust_remote_code=trust_remote_code) + except ValueError as e: + raise ValueError( + f'Got error while loading tokenizer with trust_remote_code={trust_remote_code}: {e}. ' + + + 'If accessing a tokenizer defined outside of the transformers module,' + + ' please use --trust_remote_code.') + + def _write_zero_bias(weight_name: str, weight_file_path: str, bias_shape: Union[Tuple[int, ...], int]) -> None: """Write zeros for bias when converting MPT to FasterTransformer weights. diff --git a/scripts/inference/convert_composer_mpt_to_ft.py b/scripts/inference/convert_composer_mpt_to_ft.py index 79275030b3..f59eb6005a 100644 --- a/scripts/inference/convert_composer_mpt_to_ft.py +++ b/scripts/inference/convert_composer_mpt_to_ft.py @@ -67,6 +67,7 @@ def write_ft_checkpoint_from_composer_checkpoint( checkpoint_path: Union[Path, str], infer_gpu_num: int, save_dir: str, + trust_remote_code: bool, output_precision: str = 'fp32', local_checkpoint_save_location: Optional[Union[Path, str]] = None) -> None: @@ -79,6 +80,7 @@ def write_ft_checkpoint_from_composer_checkpoint( checkpoint_path (Union[Path, str]): Path to the composer checkpoint, can be a local path, or a remote path beginning with ``s3://``, or another backend supported by Composer. infer_gpu_num (int): The number of gpus you are planning to use for inference. + trust_remote_code (bool): Whether or not to use code outside of the transformers module. save_dir (str): Path of the directory to save the checkpoint in FT format. output_precision (str, optional): The precision of the output weights saved to the FasterTransformer model. Can be either ``fp32`` or ``fp16``. local_checkpoint_save_location (Optional[Union[Path, str]], optional): If specified, where to save the checkpoint file to locally. @@ -125,7 +127,7 @@ def write_ft_checkpoint_from_composer_checkpoint( print('#' * 30) print('Extracting HF Tokenizer...') hf_tokenizer = get_hf_tokenizer_from_composer_state_dict( - composer_state_dict) + composer_state_dict, trust_remote_code) if hf_tokenizer is None: print('Warning! No HF Tokenizer found!') @@ -206,6 +208,10 @@ def parse_args() -> Namespace: 'Data type of weights in the FasterTransformer output model. 
Input checkpoint weights will be converted to this dtype.', choices=['fp32', 'fp16'], default='fp32') + parser.add_argument( + '--trust_remote_code', + action='store_true', + help='Whether or not to use code outside of transformers module.') return parser.parse_args() @@ -229,4 +235,5 @@ def parse_args() -> Namespace: infer_gpu_num=args.infer_gpu_num, save_dir=save_dir, output_precision=args.output_precision, - local_checkpoint_save_location=args.local_checkpoint_save_location) + local_checkpoint_save_location=args.local_checkpoint_save_location, + trust_remote_code=args.trust_remote_code) diff --git a/scripts/inference/convert_composer_to_hf.py b/scripts/inference/convert_composer_to_hf.py index 5625a3b046..1b43762473 100644 --- a/scripts/inference/convert_composer_to_hf.py +++ b/scripts/inference/convert_composer_to_hf.py @@ -16,6 +16,7 @@ from llmfoundry import MPTConfig, MPTForCausalLM from llmfoundry.utils import get_hf_tokenizer_from_composer_state_dict +from llmfoundry.utils.checkpoint_conversion_helpers import load_tokenizer from llmfoundry.utils.huggingface_hub_utils import \ edit_files_for_hf_compatibility @@ -23,6 +24,7 @@ def write_huggingface_pretrained_from_composer_checkpoint( checkpoint_path: Union[Path, str], output_path: Union[Path, str], + trust_remote_code: bool, output_precision: str = 'fp32', local_checkpoint_save_location: Optional[Union[Path, str]] = None ) -> Tuple[PretrainedConfig, Optional[PreTrainedTokenizerBase]]: @@ -63,6 +65,7 @@ def write_huggingface_pretrained_from_composer_checkpoint( checkpoint_path (Union[Path, str]): Path to the composer checkpoint, can be a local path, or a remote path beginning with ``s3://``, or another backend supported by :meth:`composer.utils.maybe_create_object_store_from_uri`. output_path (Union[Path, str]): Path to the folder to write the output to. + trust_remote_code (bool): Whether or not to use code outside of the transformers module. output_precision (str, optional): The precision of the output weights saved to `pytorch_model.bin`. Can be one of ``fp32``, ``fp16``, or ``bf16``. local_checkpoint_save_location (Optional[Union[Path, str]], optional): If specified, where to save the checkpoint file to locally. If the input ``checkpoint_path`` is already a local path, this will be a symlink. 
@@ -110,7 +113,7 @@ def write_huggingface_pretrained_from_composer_checkpoint( print('#' * 30) print('Saving HF Tokenizer...') hf_tokenizer = get_hf_tokenizer_from_composer_state_dict( - composer_state_dict) + composer_state_dict, trust_remote_code) if hf_tokenizer is not None: hf_tokenizer.save_pretrained(output_path) print(hf_tokenizer) @@ -157,6 +160,10 @@ def parse_args() -> Namespace: default='fp32') parser.add_argument('--hf_repo_for_upload', type=str, default=None) parser.add_argument('--test_uploaded_model', action='store_true') + parser.add_argument( + '--trust_remote_code', + action='store_true', + help='Whether or not to use code outside of transformers module.') return parser.parse_args() @@ -179,6 +186,7 @@ def convert_composer_to_hf(args: Namespace) -> None: config, tokenizer = write_huggingface_pretrained_from_composer_checkpoint( checkpoint_path=args.composer_path, output_path=local_folder_path, + trust_remote_code=args.trust_remote_code, output_precision=args.output_precision, local_checkpoint_save_location=args.local_checkpoint_save_location) @@ -206,7 +214,9 @@ def convert_composer_to_hf(args: Namespace) -> None: loaded_hf_model.save_pretrained(local_folder_path) print(f'Loading tokenizer from {local_folder_path}') - tokenizer = transformers.AutoTokenizer.from_pretrained(local_folder_path) + + tokenizer = load_tokenizer(local_folder_path, + trust_remote_code=args.trust_remote_code) tokenizer.save_pretrained(local_folder_path) # Only need to edit files for MPT because it has custom code diff --git a/tests/test_hf_conversion_script.py b/tests/test_hf_conversion_script.py index d2c2a9e1c9..6d5a282993 100644 --- a/tests/test_hf_conversion_script.py +++ b/tests/test_hf_conversion_script.py @@ -530,6 +530,7 @@ def test_convert_and_generate(model: str, tmp_path: pathlib.Path): output_precision='fp32', local_checkpoint_save_location=None, hf_repo_for_upload=None, + trust_remote_code=False, test_uploaded_model=False) convert_composer_to_hf(args) @@ -577,6 +578,7 @@ def test_convert_and_generate_triton(tmp_path: pathlib.Path): output_precision='fp32', local_checkpoint_save_location=None, hf_repo_for_upload=None, + trust_remote_code=False, test_uploaded_model=False) convert_composer_to_hf(args) @@ -631,6 +633,7 @@ def test_convert_and_generate_meta(tmp_path: pathlib.Path): output_precision='fp32', local_checkpoint_save_location=None, hf_repo_for_upload=None, + trust_remote_code=False, test_uploaded_model=False) convert_composer_to_hf(args) From 789917883f58578df34a62a4895341728098d2be Mon Sep 17 00:00:00 2001 From: Vitaliy Chiley <6439018+vchiley@users.noreply.github.com> Date: Mon, 13 Nov 2023 13:25:44 -0800 Subject: [PATCH 34/49] Enable `tie_word_embeddings` config setting to enable / disable weight tied embeddings (#728) * enable disabling embed weight tying * fix bug * updt with descriptive var names * fix hf config * move comment with code * bug fix * add _tie_weights method * undo mcli yaml change * refactor * add tests * Update llmfoundry/models/mpt/modeling_mpt.py Co-authored-by: Sasha Doubov * pr comments * updt tests to guard against numerical issues --------- Co-authored-by: Sasha Doubov --- llmfoundry/models/mpt/configuration_mpt.py | 8 ++- llmfoundry/models/mpt/modeling_mpt.py | 72 ++++++++++++++++------ tests/test_hf_conversion_script.py | 41 ++++++++---- tests/test_model.py | 72 ++++++++++++++++------ tests/test_mpt_gen.py | 31 +++++++--- tests/test_onnx.py | 5 +- 6 files changed, 169 insertions(+), 60 deletions(-) diff --git 
a/llmfoundry/models/mpt/configuration_mpt.py b/llmfoundry/models/mpt/configuration_mpt.py index c4ca68d733..c0a1e65248 100644 --- a/llmfoundry/models/mpt/configuration_mpt.py +++ b/llmfoundry/models/mpt/configuration_mpt.py @@ -59,6 +59,7 @@ def __init__( use_cache: bool = False, init_config: Dict = init_config_defaults, fc_type: str = 'torch', + tie_word_embeddings: bool = True, verbose: Optional[int] = None, **kwargs: Any, ): @@ -128,6 +129,7 @@ def __init__( --- See llmfoundry.models.utils.param_init_fns.py for info on other param init config options fc_type (str): choose fc layer implementation. Options: torch and te. te layers support fp8 when using H100 GPUs. + tie_word_embeddings (bool): Whether to tie the input embedding and output layers. """ self.d_model = d_model self.n_heads = n_heads @@ -164,7 +166,11 @@ def __init__( warnings.warn( f'alibi or rope is turned on, setting `learned_pos_emb` to `False.`' ) - super().__init__(**kwargs) + # tie_word_embeddings is set in Huggingface's PretrainedConfig __init__ + super().__init__( + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) self._validate_config() diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py index 0cb3ebd56c..10c042d27c 100644 --- a/llmfoundry/models/mpt/modeling_mpt.py +++ b/llmfoundry/models/mpt/modeling_mpt.py @@ -231,10 +231,11 @@ def __init__(self, config: MPTConfig): log.debug(self) log.debug(f'Using {self.config.init_config["name"]} initialization.') - def get_input_embeddings(self) -> nn.Embedding: + def get_input_embeddings(self) -> Union[SharedEmbedding, nn.Embedding]: return self.wte - def set_input_embeddings(self, value: nn.Embedding) -> None: + def set_input_embeddings( + self, value: Union[SharedEmbedding, nn.Embedding]) -> None: self.wte = value @torch.no_grad() @@ -574,14 +575,20 @@ class MPTForCausalLM(MPTPreTrainedModel): def __init__(self, config: MPTConfig): super().__init__(config) - if not config.tie_word_embeddings: - raise ValueError( - 'MPTForCausalLM only supports tied word embeddings') - log.info(f'Instantiating an MPTForCausalLM model from {__file__}') self.transformer: MPTModel = MPTModel(config) + self.lm_head = None + if not config.tie_word_embeddings: + self.lm_head = nn.Linear( + config.d_model, + config.vocab_size, + bias=False, + device=config.init_device, + ) + self.lm_head._fsdp_wrap = True + for child in self.transformer.children(): if isinstance(child, torch.nn.ModuleList): continue @@ -602,19 +609,38 @@ def __init__(self, config: MPTConfig): ) self.logit_scale = logit_scale - def get_input_embeddings(self) -> nn.Embedding: - return self.transformer.wte + def get_input_embeddings(self) -> Union[SharedEmbedding, nn.Embedding]: + return self.transformer.get_input_embeddings() def set_input_embeddings( self, value: Union[SharedEmbedding, nn.Embedding]) -> None: - self.transformer.wte = value + self.transformer.set_input_embeddings(value) - def get_output_embeddings(self) -> nn.Embedding: - return self.transformer.wte + def get_output_embeddings( + self) -> Union[SharedEmbedding, nn.Embedding, nn.Linear]: + if self.lm_head is not None: + return self.lm_head + return self.transformer.get_input_embeddings() def set_output_embeddings( - self, new_embeddings: Union[SharedEmbedding, nn.Embedding]) -> None: - self.transformer.wte = new_embeddings + self, new_embeddings: Union[SharedEmbedding, nn.Embedding, + nn.Linear]) -> None: + if self.lm_head is not None: + self.lm_head = new_embeddings + else: + if not isinstance(new_embeddings, 
(SharedEmbedding, nn.Embedding)): + raise ValueError( + 'new_embeddings must be an instance of SharedEmbedding ' + + f'or nn.Embedding, but got {type(new_embeddings)}.') + warnings.warn( + 'Using `set_output_embeddings` to set the embedding layer of ' + + 'MPTForCausalLM with tied weights. Given weights are tied, ' + + 'using `set_input_embeddings` is recommended over using ' + + '`set_output_embeddings`.') + self.transformer.set_input_embeddings(new_embeddings) + + def tie_weights(self) -> None: + self.lm_head = None def set_decoder(self, decoder: MPTModel) -> None: self.transformer = decoder @@ -658,12 +684,14 @@ def forward( use_cache=use_cache, ) - # move outputs to same device as weights for token embedding - # needed to support HF `device_map` - logits = self.transformer.wte( - outputs.last_hidden_state.to(self.transformer.wte.weight.device), - True, - ) + if self.lm_head is not None: + logits = self.lm_head(outputs.last_hidden_state) + else: + # move outputs to same device as weights for token embedding + # needed to support HF `device_map` + out = outputs.last_hidden_state + out = out.to(self.transformer.wte.weight.device) + logits = self.transformer.wte(out, True) if self.logit_scale is not None: if self.logit_scale == 0: @@ -859,7 +887,11 @@ def flops_per_batch(self, batch: Mapping) -> int: # assume the backward pass is approximately 2x the forward pass bs, msl = batch['input_ids'].shape[0:2] - params_flops_per_token = 2 * self.n_active_params + params = self.n_active_params + if not self.model.transformer.config.tie_word_embeddings: + # embedding layers are lookup tables, therefore are not counted in the FLOP computation + params -= self.model.transformer.wte.weight.numel() + params_flops_per_token = 2 * params params_flops_per_seq = params_flops_per_token * msl attn_flops_per_seq = (self.model.config.n_layers * 2 * 2 * (self.model.config.d_model * (msl**2))) diff --git a/tests/test_hf_conversion_script.py b/tests/test_hf_conversion_script.py index 6d5a282993..af94126225 100644 --- a/tests/test_hf_conversion_script.py +++ b/tests/test_hf_conversion_script.py @@ -248,20 +248,21 @@ def test_callback_inits_with_defaults(): @pytest.mark.world_size(2) @pytest.mark.gpu -@pytest.mark.parametrize('model', ['mpt', 'neo', 'llama2']) +@pytest.mark.parametrize( + 'model,tie_word_embeddings', + [('mpt', True), ('mpt', False), ('neo', None), ('llama2', None)], +) @pytest.mark.parametrize('fsdp_state_dict_type', ['full', 'sharded', None]) @pytest.mark.parametrize('log_to_mlflow', [True, False]) @pytest.mark.parametrize( 'hf_save_interval,save_interval,max_duration,expected_hf_checkpoints,expected_normal_checkpoints', [('3ba', '2ba', '7ba', 3, 4), ('1dur', '2ba', '1ep', 1, 4)]) @patch('os.cpu_count', MagicMock(return_value=None)) -def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path, - fsdp_state_dict_type: Optional[str], - log_to_mlflow: bool, - hf_save_interval: str, - save_interval: str, max_duration: str, - expected_hf_checkpoints: int, - expected_normal_checkpoints: int): +def test_huggingface_conversion_callback( + model: str, tmp_path: pathlib.Path, tie_word_embeddings: bool, + fsdp_state_dict_type: Optional[str], log_to_mlflow: bool, + hf_save_interval: str, save_interval: str, max_duration: str, + expected_hf_checkpoints: int, expected_normal_checkpoints: int): delete_transformers_cache() dist.initialize_dist(get_device('gpu')) @@ -298,9 +299,11 @@ def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path, 'attn_impl': 'torch', }, 'loss_fn': 
'torch_crossentropy', + 'tie_word_embeddings': tie_word_embeddings, } tokenizer_name = 'EleutherAI/gpt-neox-20b' elif model == 'neo': + assert tie_word_embeddings is None model_cfg = { 'name': 'hf_causal_lm', 'pretrained_model_name_or_path': 'EleutherAI/gpt-neo-125M', @@ -313,6 +316,7 @@ def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path, } tokenizer_name = 'EleutherAI/gpt-neo-125M' elif model == 'llama2': + assert tie_word_embeddings is None if 'HUGGING_FACE_HUB_TOKEN' not in os.environ: pytest.skip( 'The CI cluster does not have access to the Llama models, so skip this test.' @@ -489,19 +493,26 @@ def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path, delete_transformers_cache() -@pytest.mark.parametrize('model', ['mpt', 'neo', 'llama2']) -def test_convert_and_generate(model: str, tmp_path: pathlib.Path): +@pytest.mark.parametrize( + 'model,tie_word_embeddings', + [('mpt', True), ('mpt', False), ('neo', None), ('llama2', None)], +) +def test_convert_and_generate(model: str, tie_word_embeddings: bool, + tmp_path: pathlib.Path): delete_transformers_cache() om_cfg = None if model == 'mpt': om_cfg = get_config( conf_path='scripts/train/yamls/pretrain/testing.yaml') + om_cfg['tie_word_embeddings'] = tie_word_embeddings elif model == 'neo': + assert tie_word_embeddings is None om_cfg = get_config( conf_path='scripts/train/yamls/pretrain/gpt-neo-125m.yaml') om_cfg['model']['config_overrides']['hidden_size'] = 36 elif model == 'llama2': + assert tie_word_embeddings is None if 'HUGGING_FACE_HUB_TOKEN' not in os.environ: pytest.skip( 'The CI cluster does not have access to the Llama models, so skip this test.' @@ -562,11 +573,14 @@ def test_convert_and_generate(model: str, tmp_path: pathlib.Path): @pytest.mark.gpu -def test_convert_and_generate_triton(tmp_path: pathlib.Path): +@pytest.mark.parametrize('tie_word_embeddings', [True, False]) +def test_convert_and_generate_triton(tie_word_embeddings: str, + tmp_path: pathlib.Path): delete_transformers_cache() cfg = get_config() cfg['model']['init_device'] = 'cpu' + cfg['tie_word_embeddings'] = tie_word_embeddings tokenizer = transformers.AutoTokenizer.from_pretrained( 'EleutherAI/gpt-neox-20b') model = ComposerMPTCausalLM(cfg['model'], tokenizer) @@ -602,7 +616,9 @@ def test_convert_and_generate_triton(tmp_path: pathlib.Path): delete_transformers_cache() -def test_convert_and_generate_meta(tmp_path: pathlib.Path): +@pytest.mark.parametrize('tie_word_embeddings', [True, False]) +def test_convert_and_generate_meta(tie_word_embeddings: str, + tmp_path: pathlib.Path): delete_transformers_cache() from composer.utils import dist @@ -612,6 +628,7 @@ def test_convert_and_generate_meta(tmp_path: pathlib.Path): om_cfg = get_config(conf_path='scripts/train/yamls/pretrain/testing.yaml') om_cfg['model']['init_device'] = 'cpu' + om_cfg['tie_word_embeddings'] = tie_word_embeddings tokenizer = transformers.AutoTokenizer.from_pretrained( om_cfg.tokenizer.name) original_model = COMPOSER_MODEL_REGISTRY[om_cfg['model'].name]( diff --git a/tests/test_model.py b/tests/test_model.py index 41b62f0ccf..3308c65fd3 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -466,7 +466,8 @@ def test_opt_wrapping(): @pytest.mark.parametrize('norm_type', NORM_CLASS_REGISTRY.keys()) @pytest.mark.parametrize('no_bias', [False, True]) -def test_mpt_creation(norm_type: str, no_bias: bool): +@pytest.mark.parametrize('tie_word_embeddings', [True, False]) +def test_mpt_creation(norm_type: str, no_bias: bool, tie_word_embeddings: bool): # 
Test that the config constructs the model as expected. hf_config = MPTConfig( init_device='cpu', @@ -482,6 +483,7 @@ def test_mpt_creation(norm_type: str, no_bias: bool): }, norm_type=norm_type, no_bias=no_bias, + tie_word_embeddings=tie_word_embeddings, ) mpt = MPTForCausalLM(hf_config) @@ -493,6 +495,9 @@ def test_mpt_creation(norm_type: str, no_bias: bool): assert mpt.transformer.wte.weight.shape == torch.Size( [hf_config.vocab_size, hf_config.d_model]) + if not tie_word_embeddings: + assert mpt.lm_head is not None + assert mpt.lm_head.weight.shape == mpt.transformer.wte.weight.shape assert mpt.transformer.wpe.weight.shape == torch.Size( [hf_config.max_seq_len, hf_config.d_model]) assert mpt.transformer.emb_drop.p == 0.1 @@ -544,8 +549,9 @@ def test_mpt_creation(norm_type: str, no_bias: bool): 'factor': 1.0, }, }]) +@pytest.mark.parametrize('tie_word_embeddings', [True, False]) def test_forward_with_padding(attention_impl: str, device: str, - pos_emb_config: dict): + pos_emb_config: dict, tie_word_embeddings: bool): # Test that different placement of padding does not affect the output. if not torch.cuda.is_available() and device == 'gpu': pytest.skip( @@ -580,6 +586,7 @@ def test_forward_with_padding(attention_impl: str, device: str, 'name': 'baseline_', 'init_std': 0.02, }, + tie_word_embeddings=tie_word_embeddings, ) mpt = MPTForCausalLM(hf_config) mpt.eval() @@ -736,10 +743,13 @@ def test_advanced_mask_building(attention_impl: str): assert torch.equal(attn_bias, expected_attn_bias) -@pytest.mark.parametrize('attention_impl,device', [('torch', 'cpu'), - ('flash', 'gpu'), - ('triton', 'gpu'), - ('torch', 'gpu')]) +@pytest.mark.parametrize('attention_impl,device,precision', [ + ('torch', 'cpu', 'fp32'), + ('flash', 'gpu', 'amp_bf16'), + ('triton', 'gpu', 'amp_bf16'), + ('torch', 'gpu', 'amp_bf16'), + ('torch', 'gpu', 'fp32'), +]) @pytest.mark.parametrize('pos_emb_config', [{ 'alibi': False, 'rope': False @@ -766,7 +776,9 @@ def test_advanced_mask_building(attention_impl: str): 'factor': 1.0, }, }]) -def test_generate(attention_impl: str, device: str, pos_emb_config: dict): +@pytest.mark.parametrize('tie_word_embeddings', [True, False]) +def test_generate(attention_impl: str, device: str, precision: str, + pos_emb_config: dict, tie_word_embeddings: bool): # Test that generate works, and produces the same output with or without # padding in the input. 
if not torch.cuda.is_available() and device == 'gpu': @@ -780,6 +792,8 @@ def test_generate(attention_impl: str, device: str, pos_emb_config: dict): device != 'gpu' or not is_flash_v2_installed()): pytest.skip( f'dail implementation of rope requires gpu and flash attention 2.') + if attention_impl == 'torch' and precision == 'amp_bf16' and tie_word_embeddings == False: + pytest.skip(f'This test configuration has precision / sampling issues.') composer_device = get_device(device) @@ -796,10 +810,11 @@ def test_generate(attention_impl: str, device: str, pos_emb_config: dict): 'attn_impl': attention_impl, **pos_emb_config, }, + tie_word_embeddings=tie_word_embeddings, ) mpt = MPTForCausalLM(hf_config) - mpt.eval() mpt = composer_device.module_to_device(mpt) + mpt.eval() # padding on the left of the input left_padding_input_ids = torch.tensor( @@ -830,8 +845,7 @@ def test_generate(attention_impl: str, device: str, pos_emb_config: dict): batched_attention_mask = composer_device.tensor_to_device( batched_attention_mask) - with get_precision_context('amp_bf16' if composer_device.name == - 'gpu' else 'fp32'): + with get_precision_context(precision): # check that a batch with different amounts of padding doesn't crash # and produces the right output shape batched_generation = mpt.generate(input_ids=batched_input_ids, @@ -861,8 +875,9 @@ def test_generate(attention_impl: str, device: str, pos_emb_config: dict): @pytest.mark.gpu @pytest.mark.parametrize('world_size', [1, 2]) @pytest.mark.parametrize('use_cache', [False, True]) +@pytest.mark.parametrize('tie_word_embeddings', [True, False]) def test_generate_with_device_map(tmp_path: pathlib.Path, world_size: int, - use_cache: bool): + use_cache: bool, tie_word_embeddings: bool): if not torch.cuda.is_available(): pytest.skip(f'This test requires CUDA to be available.') if not torch.cuda.device_count() >= world_size: @@ -882,6 +897,7 @@ def test_generate_with_device_map(tmp_path: pathlib.Path, world_size: int, 'attn_impl': 'torch', }, use_cache=use_cache, + tie_word_embeddings=tie_word_embeddings, ) mpt = MPTForCausalLM(hf_config) mpt.save_pretrained(save_path) @@ -994,8 +1010,10 @@ def test_save_from_pretrained(tmp_path: pathlib.Path): 'factor': 1.0, }, }]) +@pytest.mark.parametrize('tie_word_embeddings', [True, False]) def test_forward_with_cache_and_padding(attn_impl: str, device: str, - pos_emb_config: dict): + pos_emb_config: dict, + tie_word_embeddings: bool): # Tests that the result is the same with or without padding when using kv caching if not torch.cuda.is_available() and device == 'gpu': pytest.skip( @@ -1028,6 +1046,7 @@ def test_forward_with_cache_and_padding(attn_impl: str, device: str, 'name': 'baseline_', 'init_std': 0.02, }, + tie_word_embeddings=tie_word_embeddings, ) mpt = MPTForCausalLM(hf_config) @@ -1133,7 +1152,9 @@ def test_forward_with_cache_and_padding(attn_impl: str, device: str, 'factor': 1.0, }, }]) -def test_forward_with_cache(attn_impl: str, device: str, pos_emb_config: dict): +@pytest.mark.parametrize('tie_word_embeddings', [True, False]) +def test_forward_with_cache(attn_impl: str, device: str, pos_emb_config: dict, + tie_word_embeddings: bool): # Test that model forward with and without the key-value cache produces the # same output. 
if not torch.cuda.is_available() and device == 'gpu': @@ -1168,6 +1189,7 @@ def test_forward_with_cache(attn_impl: str, device: str, pos_emb_config: dict): 'name': 'baseline_', 'init_std': 0.02, }, + tie_word_embeddings=tie_word_embeddings, ) mpt = MPTForCausalLM(hf_config) mpt = composer_device.module_to_device(mpt) @@ -1237,7 +1259,7 @@ def test_forward_with_cache(attn_impl: str, device: str, pos_emb_config: dict): torch.testing.assert_close( second_output.logits, full_output.logits[:, -1, :].unsqueeze(1), - atol=1e-2, + atol=1.1e-2, rtol=1e-2, ) @@ -1274,8 +1296,9 @@ def test_forward_with_cache(attn_impl: str, device: str, pos_emb_config: dict): 'factor': 1.0, }, }]) +@pytest.mark.parametrize('tie_word_embeddings', [True, False]) def test_generate_with_past_kv(attn_impl: str, device: str, - pos_emb_config: dict): + pos_emb_config: dict, tie_word_embeddings: bool): if not torch.cuda.is_available() and device == 'gpu': pytest.skip( f'This test requires CUDA to be available in order to run with {attn_impl} attention.' @@ -1307,6 +1330,7 @@ def test_generate_with_past_kv(attn_impl: str, device: str, 'name': 'baseline_', 'init_std': 0.02, }, + tie_word_embeddings=tie_word_embeddings, ) mpt = MPTForCausalLM(hf_config) mpt = composer_device.module_to_device(mpt) @@ -1325,7 +1349,8 @@ def test_generate_with_past_kv(attn_impl: str, device: str, with mock.patch.object(MPTForCausalLM, 'forward', autospec=True) as forward_mocked: forward_mocked.return_value = CausalLMOutputWithPast( - logits=torch.randn((1, 3, hf_config.vocab_size)), + logits=composer_device.tensor_to_device( + torch.randn((1, 3, hf_config.vocab_size))), past_key_values=[(torch.randn(1, 3, hf_config.d_model), torch.randn(1, 3, hf_config.d_model)) for _ in range(hf_config.n_layers)]) @@ -1386,9 +1411,11 @@ def test_generate_with_past_kv(attn_impl: str, device: str, 'factor': 1.0, }, }]) +@pytest.mark.parametrize('tie_word_embeddings', [True, False]) def test_generation_kwargs_dont_crash(attn_impl: str, device: str, generation_kwargs: Dict[str, Any], - pos_emb_config: dict): + pos_emb_config: dict, + tie_word_embeddings: bool): if not torch.cuda.is_available() and device == 'gpu': pytest.skip( f'This test requires CUDA to be available in order to run with {attn_impl} attention.' 
@@ -1417,6 +1444,7 @@ def test_generation_kwargs_dont_crash(attn_impl: str, device: str, **pos_emb_config, }, use_cache=True, + tie_word_embeddings=tie_word_embeddings, ) mpt = MPTForCausalLM(hf_config) mpt = composer_device.module_to_device(mpt) @@ -1467,7 +1495,9 @@ def test_generation_kwargs_dont_crash(attn_impl: str, device: str, 'factor': 1.0, }, }]) -def test_model_to(attention_impl: str, pos_emb_config: dict): +@pytest.mark.parametrize('tie_word_embeddings', [True, False]) +def test_model_to(attention_impl: str, pos_emb_config: dict, + tie_word_embeddings: bool): # test that moving the model to diff devices and dtypes in diff ways does not break the model if not torch.cuda.is_available(): pytest.skip( @@ -1498,6 +1528,7 @@ def test_model_to(attention_impl: str, pos_emb_config: dict): 'name': 'baseline_', 'init_std': 0.02, }, + tie_word_embeddings=tie_word_embeddings, ) mpt = MPTForCausalLM(hf_config) mpt = mpt.bfloat16() @@ -1600,9 +1631,11 @@ def test_alibi_vs_hf(): }]) @pytest.mark.parametrize('output_attentions', [True, False]) @pytest.mark.parametrize('output_hidden_states', [True, False]) +@pytest.mark.parametrize('tie_word_embeddings', [True, False]) def test_forward_with_output_attentions_and_output_hidden_states( attn_impl: str, device: str, pos_emb_config: dict, - output_attentions: bool, output_hidden_states: bool): + output_attentions: bool, output_hidden_states: bool, + tie_word_embeddings: bool): # Test that model forward with output_attentions_and_output_hidden_states if not torch.cuda.is_available() and device == 'gpu': pytest.skip( @@ -1639,6 +1672,7 @@ def test_forward_with_output_attentions_and_output_hidden_states( 'name': 'baseline_', 'init_std': 0.02, }, + tie_word_embeddings=tie_word_embeddings, ) mpt = MPTForCausalLM(hf_config) mpt = composer_device.module_to_device(mpt) diff --git a/tests/test_mpt_gen.py b/tests/test_mpt_gen.py index c52b765480..413e39bf8c 100644 --- a/tests/test_mpt_gen.py +++ b/tests/test_mpt_gen.py @@ -55,9 +55,11 @@ def forward( @pytest.mark.gpu @pytest.mark.parametrize('attn_impl', ['triton', 'torch']) @pytest.mark.parametrize('use_alibi', [True, False]) +@pytest.mark.parametrize('tie_word_embeddings', [True, False]) @patch('llmfoundry.models.mpt.modeling_mpt.MPTForCausalLM', new=MockMPTForCausalLM) def test_mpt_generate_multi_gpu(attn_impl: str, use_alibi: bool, + tie_word_embeddings: bool, build_tiny_mpt: Callable[..., ComposerMPTCausalLM], mpt_tokenizer: PreTrainedTokenizerBase): @@ -67,11 +69,14 @@ def test_mpt_generate_multi_gpu(attn_impl: str, use_alibi: bool, """ device = get_device('gpu') - model = build_tiny_mpt(attn_config={ - 'attn_impl': attn_impl, - 'attn_uses_sequence_id': False, - 'alibi': use_alibi - },) + model = build_tiny_mpt( + tie_word_embeddings=tie_word_embeddings, + attn_config={ + 'attn_impl': attn_impl, + 'attn_uses_sequence_id': False, + 'alibi': use_alibi + }, + ) model = device.module_to_device(model) model.eval() @@ -88,13 +93,25 @@ def test_mpt_generate_multi_gpu(attn_impl: str, use_alibi: bool, @pytest.mark.gpu -def test_mpt_generate_callback(build_tiny_mpt: Callable[..., +@pytest.mark.parametrize('attn_impl', ['triton', 'torch']) +@pytest.mark.parametrize('use_alibi', [True, False]) +@pytest.mark.parametrize('tie_word_embeddings', [True, False]) +def test_mpt_generate_callback(attn_impl: str, use_alibi: bool, + tie_word_embeddings: bool, + build_tiny_mpt: Callable[..., ComposerMPTCausalLM], tiny_ft_dataloader: DataLoader): device = get_device('gpu') # build mpt model - model = build_tiny_mpt() + model = 
build_tiny_mpt( + tie_word_embeddings=tie_word_embeddings, + attn_config={ + 'attn_impl': attn_impl, + 'attn_uses_sequence_id': False, + 'alibi': use_alibi + }, + ) model = device.module_to_device(model) # generate callback diff --git a/tests/test_onnx.py b/tests/test_onnx.py index d0e01746eb..becd3c773f 100644 --- a/tests/test_onnx.py +++ b/tests/test_onnx.py @@ -3,6 +3,7 @@ import pathlib +import pytest import torch from transformers import AutoModelForCausalLM @@ -25,7 +26,8 @@ def gen_random_batch(batch_size: int, vocab_size: int, max_seq_len: int): return batch -def test_onnx_export(tmp_path: pathlib.Path): +@pytest.mark.parametrize('tie_word_embeddings', [True, False]) +def test_onnx_export(tie_word_embeddings: bool, tmp_path: pathlib.Path): from transformers.models.auto.configuration_auto import CONFIG_MAPPING CONFIG_MAPPING._extra_content['mpt'] = MPTConfig AutoModelForCausalLM.register(MPTConfig, MPTForCausalLM) @@ -48,6 +50,7 @@ def test_onnx_export(tmp_path: pathlib.Path): use_cache=True, vocab_size=vocab_size, norm_type='layernorm', + tie_word_embeddings=tie_word_embeddings, ) mpt = MPTForCausalLM(hf_config) mpt.eval() From 8ba697cec6560fa8adaddc779b6d3ed2ff4adb36 Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Mon, 13 Nov 2023 14:13:23 -0800 Subject: [PATCH 35/49] add act checkpoint at sub layer level (#720) * add act checkpoint at sub layer level * Update llmfoundry/models/mpt/modeling_mpt.py Co-authored-by: Mihir Patel * address comments * addess coments * add log info * fix pyright * refactor * better log info and error msg * add test * Update llmfoundry/models/mpt/modeling_mpt.py Co-authored-by: Mihir Patel * remove unneeded comments --------- Co-authored-by: Mihir Patel Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- llmfoundry/models/mpt/modeling_mpt.py | 34 ++++++++++++- tests/test_fsdp_act_checkpoint.py | 73 +++++++++++++++++++++++++++ 2 files changed, 105 insertions(+), 2 deletions(-) create mode 100644 tests/test_fsdp_act_checkpoint.py diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py index 10c042d27c..274c1b76e5 100644 --- a/llmfoundry/models/mpt/modeling_mpt.py +++ b/llmfoundry/models/mpt/modeling_mpt.py @@ -45,7 +45,9 @@ from transformers.models.llama.modeling_llama import \ LlamaRotaryEmbedding as HFRotaryEmbedding -from llmfoundry.models.layers.attention import attn_bias_shape, build_attn_bias +from llmfoundry.models.layers.attention import (ATTN_CLASS_REGISTRY, + attn_bias_shape, + build_attn_bias) from llmfoundry.models.layers.blocks import MPTBlock from llmfoundry.models.layers.custom_embedding import SharedEmbedding from llmfoundry.models.layers.fc import FC_CLASS_REGISTRY as FC_CLASS_REGISTRY @@ -733,7 +735,35 @@ def fsdp_wrap_fn(self, module: nn.Module) -> bool: # Activation Checkpointing def activation_checkpointing_fn(self, module: nn.Module) -> bool: - return isinstance(module, MPTBlock) + act_ckpt_list = getattr(self.config, 'activation_checkpointing_target', + None) or ['MPTBlock'] + + if 'MPTBlock' in act_ckpt_list or 'mptblock' in act_ckpt_list: + if len(act_ckpt_list) > 1: + log.info( + 'Activation checkpointing MPTBlock only (ignoring other sub-block modules specified in activation_checkpointing_target).' 
+ ) + return isinstance(module, MPTBlock) + + mod_types = () + for mod_name in act_ckpt_list: + if mod_name.lower() == 'mptblock': + mod_types += (MPTBlock,) + elif mod_name in ATTN_CLASS_REGISTRY: + mod_types += (ATTN_CLASS_REGISTRY[mod_name],) + elif mod_name in FFN_CLASS_REGISTRY: + mod_types += (FFN_CLASS_REGISTRY[mod_name],) + elif mod_name in NORM_CLASS_REGISTRY: + mod_types += (NORM_CLASS_REGISTRY[mod_name],) + else: + msg = ', '.join( + list(ATTN_CLASS_REGISTRY.keys()) + + list(FFN_CLASS_REGISTRY.keys()) + + list(NORM_CLASS_REGISTRY.keys()) + ['MPTBlock']) + raise ValueError( + f'{mod_name} (specified in activation_checkpointing_target) is not a recognized option out of available options {msg}.' + ) + return isinstance(module, mod_types) def prepare_inputs_for_generation( self, diff --git a/tests/test_fsdp_act_checkpoint.py b/tests/test_fsdp_act_checkpoint.py new file mode 100644 index 0000000000..1a46fcbccd --- /dev/null +++ b/tests/test_fsdp_act_checkpoint.py @@ -0,0 +1,73 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +import pytest +from composer import Trainer +from composer.utils import get_device +from omegaconf import OmegaConf as om +from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import \ + CheckpointWrapper + +from llmfoundry.models.mpt.modeling_mpt import ComposerMPTCausalLM + + +@pytest.mark.world_size(2) +@pytest.mark.gpu +@pytest.mark.parametrize('activation_checkpointing', [True, False]) +@pytest.mark.parametrize( + 'activation_checkpointing_target', + [[], ['grouped_query_attention'], ['mptblock', 'grouped_query_attention']]) +def test_fsdp_act_checkpoint(activation_checkpointing: bool, + activation_checkpointing_target: list): + device = get_device('gpu') + model_cfg = { + 'name': 'mpt_causal_lm', + 'd_model': 128, + 'n_heads': 4, + 'n_layers': 2, + 'expansion_ratio': 1, + 'max_seq_len': 16, + 'vocab_size': 50368, + 'attn_config': { + 'attn_type': 'grouped_query_attention', + 'kv_n_heads': 2, + }, + 'activation_checkpointing_target': activation_checkpointing_target + } + model_cfg = om.create(model_cfg) + + fsdp_config = { + 'activation_checkpointing': activation_checkpointing, + 'activation_checkpointing_reentrant': False, + 'activation_cpu_offload': False, + } + + model = ComposerMPTCausalLM(model_cfg) + model = device.module_to_device(model) + + trainer = Trainer( + model=model, + device='gpu', + fsdp_config=fsdp_config, + ) + + assert trainer.state.fsdp_enabled + if not activation_checkpointing: + assert not isinstance( + trainer.state.model.model._fsdp_wrapped_module.transformer. + blocks[0], CheckpointWrapper) + elif (not activation_checkpointing_target + ) or activation_checkpointing_target == [ + 'mptblock', 'grouped_query_attention' + ]: + assert isinstance( + trainer.state.model.model._fsdp_wrapped_module.transformer. + blocks[0]._fsdp_wrapped_module, CheckpointWrapper) + elif activation_checkpointing_target == ['grouped_query_attention']: + assert isinstance( + trainer.state.model.model._fsdp_wrapped_module.transformer. 
+ blocks[0]._fsdp_wrapped_module.attn, CheckpointWrapper) + else: + raise ValueError( + f'Unknown activation_checkpointing_target: {activation_checkpointing_target}' + ) From d1960f2ca842397bcb39d1bd13139b363c21641e Mon Sep 17 00:00:00 2001 From: snarayan21 Date: Mon, 13 Nov 2023 14:35:41 -0800 Subject: [PATCH 36/49] Better defaults for StreamingDataset subclasses (#723) --- llmfoundry/data/denoising.py | 6 +++--- llmfoundry/data/finetuning/dataloader.py | 6 +++--- llmfoundry/data/finetuning/tasks.py | 22 +++++++++++--------- llmfoundry/data/text_data.py | 26 ++++++++++++++---------- setup.py | 2 +- 5 files changed, 34 insertions(+), 28 deletions(-) diff --git a/llmfoundry/data/denoising.py b/llmfoundry/data/denoising.py index 7d497b4efd..8ccf7f25e9 100644 --- a/llmfoundry/data/denoising.py +++ b/llmfoundry/data/denoising.py @@ -477,13 +477,13 @@ def build_text_denoising_dataloader( remote=cfg.dataset.get('remote'), split=cfg.dataset.get('split'), shuffle=cfg.dataset.get('shuffle', False), - predownload=cfg.dataset.get('predownload', 100_000), + predownload=cfg.dataset.get('predownload', None), keep_zip=cfg.dataset.get('keep_zip', False), download_retry=cfg.dataset.get('download_retry', 2), download_timeout=cfg.dataset.get('download_timeout', 60), - validate_hash=cfg.dataset.get('validate_hash'), + validate_hash=cfg.dataset.get('validate_hash', None), shuffle_seed=cfg.dataset.get('shuffle_seed', 9176), - num_canonical_nodes=cfg.dataset.get('num_canonical_nodes', 128), + num_canonical_nodes=cfg.dataset.get('num_canonical_nodes', None), batch_size=device_batch_size, ) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 44d6d345f5..b19cab841f 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -136,13 +136,13 @@ def build_finetuning_dataloader(cfg: DictConfig, epoch_size=cfg.dataset.get('epoch_size', None), predownload=cfg.dataset.get('predownload', None), cache_limit=cfg.dataset.get('cache_limit', None), - partition_algo=cfg.dataset.get('partition_algo', 'orig'), + partition_algo=cfg.dataset.get('partition_algo', 'relaxed'), num_canonical_nodes=cfg.dataset.get('num_canonical_nodes', None), batch_size=device_batch_size, shuffle=cfg.dataset.get('shuffle', False), - shuffle_algo=cfg.dataset.get('shuffle_algo', 'py1b'), + shuffle_algo=cfg.dataset.get('shuffle_algo', 'py1e'), shuffle_seed=cfg.dataset.get('shuffle_seed', 9176), - shuffle_block_size=cfg.dataset.get('shuffle_block_size', 1 << 18), + shuffle_block_size=cfg.dataset.get('shuffle_block_size', None), sampling_method=cfg.dataset.get('sampling_method', 'balanced'), sampling_granularity=cfg.dataset.get('sampling_granularity', 1), batching_method=cfg.dataset.get('batching_method', 'random'), diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index 6ba6ad96c8..bc712a7504 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -88,12 +88,12 @@ class StreamingFinetuningDataset(StreamingDataset): keep_zip (bool): Whether to keep or delete the compressed form when decompressing downloaded shards. If ``False``, keep iff remote is local or no remote. Defaults to `False``. - epoch_size (int, optional): Number of samples to draw per epoch balanced across all + epoch_size (Union[int, str], optional): Number of samples to draw per epoch balanced across all streams. If ``None``, takes its value from the total number of underlying samples. 
Provide this field if you are weighting streams relatively to target a larger or smaller epoch size. Defaults to ``None``. predownload (int, optional): Target number of samples ahead to download the shards of while - iterating. Defaults to ``100_000``. + iterating. If ``None``, its value is set to ``8 * batch_size``. Defaults to ``None``. cache_limit (Union[int, str], optional) - Maximum size in bytes of this StreamingDataset's shard cache. Before downloading a shard, the least recently used resident shard(s) may be evicted (deleted from the local cache) in order to stay under the limit. Set to None @@ -101,15 +101,17 @@ class StreamingFinetuningDataset(StreamingDataset): bytes (e.g., 100b, 64kb, 77mb, and so on). Defaults to None. partition_algo (str): Which partitioning algorithm to use. Defaults to ``orig``. num_canonical_nodes (int, optional): Canonical number of nodes for shuffling with - resumption. Defaults to ``None``, which is interpreted as the number of nodes of the - initial run. + resumption. If ``None``, this is interpreted as 64 times the number of physical + nodes of the initial run if ``shuffle_algo`` is ``py1s`` or ``py2s``, and simply the + number of physical nodes of the initial run otherwise. Defaults to ``None``. batch_size (int, optional): Batch size of its DataLoader, which affects how the dataset is partitioned over the workers. Defaults to ``None``. shuffle (bool): Whether to iterate over the samples in randomized order. Defaults to ``False``. - shuffle_algo (str): Which shuffling algorithm to use. Defaults to ``py1b``. + shuffle_algo (str): Which shuffling algorithm to use. Defaults to ``py1e``. shuffle_seed (int): Seed for Deterministic data shuffling. Defaults to ``9176``. - shuffle_block_size (int): Unit of shuffle. Defaults to ``1 << 18``. + shuffle_block_size (int): Unit of shuffle. If ``None``, its value is calculated as + ``max(4_000_000 // num_canonical_nodes), 1 << 18)``. Defaults to ``None``. sampling_method (str): Which sampling method to use, either ``balanced`` or ``fixed``. Defaults to ``balanced``. sampling_granularity (int): When picking samples for a stream's final partial repeat, @@ -129,16 +131,16 @@ def __init__(self, download_timeout: float = 60, validate_hash: Optional[str] = None, keep_zip: bool = False, - epoch_size: Optional[int] = None, + epoch_size: Optional[Union[int, str]] = None, predownload: Optional[int] = None, cache_limit: Optional[Union[int, str]] = None, - partition_algo: str = 'orig', + partition_algo: str = 'relaxed', num_canonical_nodes: Optional[int] = None, batch_size: Optional[int] = None, shuffle: bool = False, - shuffle_algo: str = 'py1b', + shuffle_algo: str = 'py1e', shuffle_seed: int = 9176, - shuffle_block_size: int = 1 << 18, + shuffle_block_size: Optional[int] = None, sampling_method: str = 'balanced', sampling_granularity: int = 1, batching_method: str = 'random', diff --git a/llmfoundry/data/text_data.py b/llmfoundry/data/text_data.py index 93af2f63ed..51fd6b38dc 100644 --- a/llmfoundry/data/text_data.py +++ b/llmfoundry/data/text_data.py @@ -46,12 +46,12 @@ class StreamingTextDataset(StreamingDataset): keep_zip (bool): Whether to keep or delete the compressed form when decompressing downloaded shards. If ``False``, keep iff remote is local or no remote. Defaults to `False``. - epoch_size (int, optional): Number of samples to draw per epoch balanced across all + epoch_size (Union[int, str], optional): Number of samples to draw per epoch balanced across all streams. 
If ``None``, takes its value from the total number of underlying samples. Provide this field if you are weighting streams relatively to target a larger or smaller epoch size. Defaults to ``None``. predownload (int, optional): Target number of samples ahead to download the shards of while - iterating. Defaults to ``100_000``. + iterating. If ``None``, its value is set to ``8 * batch_size``. Defaults to ``None``. cache_limit (Union[int, str], optional) - Maximum size in bytes of this StreamingDataset's shard cache. Before downloading a shard, the least recently used resident shard(s) may be evicted (deleted from the local cache) in order to stay under the limit. Set to None @@ -59,15 +59,19 @@ class StreamingTextDataset(StreamingDataset): bytes (e.g., 100b, 64kb, 77mb, and so on). Defaults to None. partition_algo (str): Which partitioning algorithm to use. Defaults to ``orig``. num_canonical_nodes (int, optional): Canonical number of nodes for shuffling with - resumption. Defaults to ``None``, which is interpreted as the number of nodes of the - initial run. + resumption. If ``None``, this is interpreted as 64 times the number of physical + nodes of the initial run if ``shuffle_algo`` is ``py1s`` or ``py2s``, and simply the + number of physical nodes of the initial run otherwise. Defaults to ``None``. batch_size (int, optional): Batch size of its DataLoader, which affects how the dataset is partitioned over the workers. Defaults to ``None``. shuffle (bool): Whether to iterate over the samples in randomized order. Defaults to ``False``. - shuffle_algo (str): Which shuffling algorithm to use. Defaults to ``py1b``. + shuffle_algo (str): Which shuffling algorithm to use. Defaults to ``py1e``. shuffle_seed (int): Seed for Deterministic data shuffling. Defaults to ``9176``. - shuffle_block_size (int): Unit of shuffle. Defaults to ``1 << 18``. + shuffle_block_size (int, optional): Unit of shuffle. A canonical node's samples are split + into blocks of this size, and samples within each block are shuffled. If ``None``, its + value is calculated as ``max(4_000_000 // num_canonical_nodes), 1 << 18)``. Defaults to + ``None``. sampling_method (str): Which sampling method to use, either ``balanced`` or ``fixed``. Defaults to ``balanced``. 
sampling_granularity (int): When picking samples for a stream's final partial repeat, @@ -89,16 +93,16 @@ def __init__(self, download_timeout: float = 60, validate_hash: Optional[str] = None, keep_zip: bool = False, - epoch_size: Optional[int] = None, - predownload: int = 100_000, + epoch_size: Optional[Union[int, str]] = None, + predownload: Optional[int] = None, cache_limit: Optional[Union[int, str]] = None, - partition_algo: str = 'orig', + partition_algo: str = 'relaxed', num_canonical_nodes: Optional[int] = None, batch_size: Optional[int] = None, shuffle: bool = False, - shuffle_algo: str = 'py1b', + shuffle_algo: str = 'py1e', shuffle_seed: int = 9176, - shuffle_block_size: int = 1 << 18, + shuffle_block_size: Optional[int] = None, sampling_method: str = 'balanced', sampling_granularity: int = 1, batching_method: str = 'random', diff --git a/setup.py b/setup.py index 81178686d2..05d1d1bbbe 100644 --- a/setup.py +++ b/setup.py @@ -50,7 +50,7 @@ 'mosaicml[libcloud,wandb,mlflow,oci,gcs]>=0.16.4,<0.17', 'accelerate>=0.20,<0.21', # for HF inference `device_map` 'transformers>=4.34.1,<4.35', - 'mosaicml-streaming>=0.6,<0.7', + 'mosaicml-streaming>=0.7.1,<0.8', 'torch>=1.13.1,<2.1.1', 'datasets>=2.14.5,<2.15', 'fsspec==2023.6.0', # newer version results in a bug in datasets that duplicates data From 753feff96801a8959d22477b7857422076f6b4dc Mon Sep 17 00:00:00 2001 From: Brian <23239305+b-chu@users.noreply.github.com> Date: Mon, 13 Nov 2023 22:34:18 -0800 Subject: [PATCH 37/49] Rename log message (#734) --- llmfoundry/callbacks/hf_checkpointer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index e02bf03693..788a8943b1 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -204,7 +204,7 @@ def _save_checkpoint(self, state: State, logger: Logger): state_dict[k] = v.to(dtype=self.dtype) if dist.get_global_rank() == 0: - log.debug('Saving Hugging Face checkpoint to disk') + log.debug('Saving Hugging Face checkpoint in global rank 0') copied_config = copy.deepcopy(original_model.config) if copied_config.model_type == 'mpt': From 45113ebf4ef2ad3714c1a9b51d9cca79bcafb921 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Tue, 14 Nov 2023 10:48:56 -0800 Subject: [PATCH 38/49] remove tokenizer_name field (#735) --- scripts/inference/benchmarking/yamls/1b.yaml | 1 - scripts/inference/benchmarking/yamls/7b.yaml | 1 - scripts/train/yamls/pretrain/gpt-neo-125m.yaml | 2 -- scripts/train/yamls/pretrain/gpt-neo-125m_eval.yaml | 2 -- scripts/train/yamls/pretrain/gpt2-small.yaml | 2 -- scripts/train/yamls/pretrain/opt-3b.yaml | 2 -- 6 files changed, 10 deletions(-) diff --git a/scripts/inference/benchmarking/yamls/1b.yaml b/scripts/inference/benchmarking/yamls/1b.yaml index f94aa3d806..d1cfb3c913 100644 --- a/scripts/inference/benchmarking/yamls/1b.yaml +++ b/scripts/inference/benchmarking/yamls/1b.yaml @@ -12,7 +12,6 @@ tokenizer: model: name: mpt_causal_lm init_device: cpu - tokenizer_name: ${tokenizer_name} d_model: 2048 n_heads: 16 # Modified 24->16 so that d_head == 128 to statisfy FlashAttention n_layers: 24 diff --git a/scripts/inference/benchmarking/yamls/7b.yaml b/scripts/inference/benchmarking/yamls/7b.yaml index 55e9ae8413..f57ed2657f 100644 --- a/scripts/inference/benchmarking/yamls/7b.yaml +++ b/scripts/inference/benchmarking/yamls/7b.yaml @@ -12,7 +12,6 @@ tokenizer: model: name: mpt_causal_lm 
init_device: cpu - tokenizer_name: ${tokenizer_name} d_model: 4096 n_heads: 32 n_layers: 32 diff --git a/scripts/train/yamls/pretrain/gpt-neo-125m.yaml b/scripts/train/yamls/pretrain/gpt-neo-125m.yaml index cfb447e2e4..12914e14bc 100644 --- a/scripts/train/yamls/pretrain/gpt-neo-125m.yaml +++ b/scripts/train/yamls/pretrain/gpt-neo-125m.yaml @@ -34,7 +34,6 @@ train_loader: remote: ${data_remote} split: train shuffle: true - tokenizer_name: ${tokenizer_name} max_seq_len: ${max_seq_len} shuffle_seed: ${global_seed} drop_last: true @@ -47,7 +46,6 @@ eval_loader: remote: ${data_remote} split: val shuffle: false - tokenizer_name: ${tokenizer_name} max_seq_len: ${max_seq_len} shuffle_seed: ${global_seed} drop_last: false diff --git a/scripts/train/yamls/pretrain/gpt-neo-125m_eval.yaml b/scripts/train/yamls/pretrain/gpt-neo-125m_eval.yaml index fc1e3b0b7f..3da239c717 100644 --- a/scripts/train/yamls/pretrain/gpt-neo-125m_eval.yaml +++ b/scripts/train/yamls/pretrain/gpt-neo-125m_eval.yaml @@ -34,7 +34,6 @@ train_loader: remote: ${data_remote} split: train shuffle: true - tokenizer_name: ${tokenizer_name} max_seq_len: ${max_seq_len} shuffle_seed: ${global_seed} drop_last: true @@ -47,7 +46,6 @@ eval_loader: remote: ${data_remote} split: val shuffle: false - tokenizer_name: ${tokenizer_name} max_seq_len: ${max_seq_len} shuffle_seed: ${global_seed} drop_last: false diff --git a/scripts/train/yamls/pretrain/gpt2-small.yaml b/scripts/train/yamls/pretrain/gpt2-small.yaml index dde59d55b1..d40cff6e9e 100644 --- a/scripts/train/yamls/pretrain/gpt2-small.yaml +++ b/scripts/train/yamls/pretrain/gpt2-small.yaml @@ -34,7 +34,6 @@ train_loader: remote: ${data_remote} split: train shuffle: true - tokenizer_name: ${tokenizer_name} max_seq_len: ${max_seq_len} shuffle_seed: ${global_seed} drop_last: true @@ -47,7 +46,6 @@ eval_loader: remote: ${data_remote} split: val shuffle: false - tokenizer_name: ${tokenizer_name} max_seq_len: ${max_seq_len} shuffle_seed: ${global_seed} drop_last: false diff --git a/scripts/train/yamls/pretrain/opt-3b.yaml b/scripts/train/yamls/pretrain/opt-3b.yaml index 3ac281f0ea..4423784b54 100644 --- a/scripts/train/yamls/pretrain/opt-3b.yaml +++ b/scripts/train/yamls/pretrain/opt-3b.yaml @@ -27,7 +27,6 @@ train_loader: remote: ${data_remote} split: train shuffle: true - tokenizer_name: ${tokenizer_name} max_seq_len: ${max_seq_len} shuffle_seed: ${global_seed} drop_last: true @@ -40,7 +39,6 @@ eval_loader: remote: ${data_remote} split: val shuffle: false - tokenizer_name: ${tokenizer_name} max_seq_len: ${max_seq_len} shuffle_seed: ${global_seed} drop_last: false From f114dad550d82c82fc763262fb73be62a21ba810 Mon Sep 17 00:00:00 2001 From: Sasha Doubov Date: Wed, 15 Nov 2023 08:48:39 -0800 Subject: [PATCH 39/49] Fix pairwise attention comparison in test (#737) --- tests/test_flash_triton_torch.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tests/test_flash_triton_torch.py b/tests/test_flash_triton_torch.py index 3f2c229d6d..1ede36c0b5 100644 --- a/tests/test_flash_triton_torch.py +++ b/tests/test_flash_triton_torch.py @@ -74,7 +74,7 @@ def test_attn_impl(attn_impl_0: str, cfg = om.create({ 'attn_impl': 'flash', - 'd_model': 128, + 'd_model': 64, 'n_heads': 4, 'attn_pdrop': 0, 'clip_qkv': clip_qkv, @@ -88,6 +88,7 @@ def test_attn_impl(attn_impl_0: str, cfg.attn_impl = attn_impl_0 attn0 = attention.ATTN_CLASS_REGISTRY[attn_type](**cfg).to(device) + cfg.attn_impl = attn_impl_1 attn1 = attention.ATTN_CLASS_REGISTRY[attn_type](**cfg).to(device) 
attn1.load_state_dict(attn0.state_dict()) @@ -182,7 +183,15 @@ def gen_bias(attn_impl: str): assert p.grad is not None assert tp.grad is not None assert allclose_helper(p, tp) - assert allclose_helper(p.grad, tp.grad) + + using_hf_rope = pos_emb_config['rope'] and pos_emb_config[ + 'rope_impl'] == 'hf' + + # special case that (likely) fails due to numerics + if clip_qkv and qk_ln and using_hf_rope and attn_type == 'grouped_query_attention': + assert allclose_helper(p.grad, tp.grad, atol=2.e-2, rtol=2.e-2) + else: + assert allclose_helper(p.grad, tp.grad) assert x0.grad is not None assert x1.grad is not None From db279d092befc38f8219c0a3bffb1542681c034a Mon Sep 17 00:00:00 2001 From: Wenfei Yan <87323464+wenfeiy-db@users.noreply.github.com> Date: Wed, 15 Nov 2023 11:03:16 -0800 Subject: [PATCH 40/49] Fix passed metadata to mlflow logging (#713) --- llmfoundry/callbacks/hf_checkpointer.py | 14 +++++------ llmfoundry/utils/builders.py | 5 +++- tests/test_builders.py | 32 ++++++++++++++++++++++++ tests/test_hf_conversion_script.py | 33 ++++++++++++++++++++----- 4 files changed, 70 insertions(+), 14 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 788a8943b1..c79537c781 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -74,12 +74,13 @@ def __init__( if self.mlflow_registered_model_name is not None: # Both the metadata and the task are needed in order for mlflow # and databricks optimized model serving to work - if 'metadata' not in mlflow_logging_config: - mlflow_logging_config['metadata'] = { - 'task': 'llm/v1/completions' - } - if 'task' not in mlflow_logging_config: - mlflow_logging_config['task'] = 'text-generation' + default_metadata = {'task': 'llm/v1/completions'} + passed_metadata = mlflow_logging_config.get('metadata', {}) + mlflow_logging_config['metadata'] = { + **default_metadata, + **passed_metadata + } + mlflow_logging_config.setdefault('task', 'text-generation') self.mlflow_logging_config = mlflow_logging_config self.huggingface_folder_name_fstr = os.path.join( @@ -93,7 +94,6 @@ def __init__( self.save_interval = save_interval self.check_interval = create_interval_scheduler( save_interval, include_end_of_training=True) - self.remote_ud = maybe_create_remote_uploader_downloader_from_uri( save_folder, loggers=[]) if self.remote_ud is not None: diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 142e714b55..dedf6f5434 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -73,7 +73,8 @@ def build_icl_data_and_gauntlet( return icl_evaluators, logger_keys, eval_gauntlet_cb -def build_callback(name: str, kwargs: Dict[str, Any]) -> Callback: +def build_callback(name: str, kwargs: Union[DictConfig, Dict[str, + Any]]) -> Callback: if name == 'lr_monitor': return LRMonitor() elif name == 'memory_monitor': @@ -117,6 +118,8 @@ def build_callback(name: str, kwargs: Dict[str, Any]) -> Callback: elif name == 'early_stopper': return EarlyStopper(**kwargs) elif name == 'hf_checkpointer': + if isinstance(kwargs, DictConfig): + kwargs = om.to_object(kwargs) # pyright: ignore return HuggingFaceCheckpointer(**kwargs) else: raise ValueError(f'Not sure how to build callback: {name}') diff --git a/tests/test_builders.py b/tests/test_builders.py index 0d24d2154f..237e27b52b 100644 --- a/tests/test_builders.py +++ b/tests/test_builders.py @@ -6,8 +6,10 @@ import pytest from composer.callbacks import Generate +from omegaconf import 
OmegaConf as om from transformers import PreTrainedTokenizerBase +from llmfoundry.callbacks import HuggingFaceCheckpointer from llmfoundry.tokenizers.tiktoken import TiktokenTokenizerWrapper from llmfoundry.utils.builders import build_callback, build_tokenizer @@ -78,3 +80,33 @@ def test_build_generate_callback_unspecified_interval(): 'foo': 'bar', 'something': 'else', }) + + +def test_build_hf_checkpointer_callback(): + with mock.patch.object(HuggingFaceCheckpointer, + '__init__') as mock_hf_checkpointer: + mock_hf_checkpointer.return_value = None + save_folder = 'path_to_save_folder' + save_interval = 1 + mlflow_logging_config_dict = { + 'metadata': { + 'databricks_model_family': 'MptForCausalLM', + 'databricks_model_size_parameters': '7b', + 'databricks_model_source': 'mosaic-fine-tuning', + 'task': 'llm/v1/completions' + } + } + build_callback(name='hf_checkpointer', + kwargs=om.create({ + 'save_folder': save_folder, + 'save_interval': save_interval, + 'mlflow_logging_config': mlflow_logging_config_dict + })) + + assert mock_hf_checkpointer.call_count == 1 + _, _, kwargs = mock_hf_checkpointer.mock_calls[0] + assert kwargs['save_folder'] == save_folder + assert kwargs['save_interval'] == save_interval + assert isinstance(kwargs['mlflow_logging_config'], dict) + assert isinstance(kwargs['mlflow_logging_config']['metadata'], dict) + assert kwargs['mlflow_logging_config'] == mlflow_logging_config_dict diff --git a/tests/test_hf_conversion_script.py b/tests/test_hf_conversion_script.py index af94126225..dcb743b536 100644 --- a/tests/test_hf_conversion_script.py +++ b/tests/test_hf_conversion_script.py @@ -5,7 +5,7 @@ import os import pathlib import sys -from unittest.mock import MagicMock, patch +from unittest.mock import ANY, MagicMock, patch from composer import Trainer from composer.loggers import MLFlowLogger @@ -242,9 +242,22 @@ def get_config( return cast(DictConfig, test_cfg) -def test_callback_inits_with_defaults(): +def test_callback_inits(): + # test with defaults _ = HuggingFaceCheckpointer(save_folder='test', save_interval='1ba') + # test default metatdata when mlflow registered name is given + hf_checkpointer = HuggingFaceCheckpointer( + save_folder='test', + save_interval='1ba', + mlflow_registered_model_name='test_model_name') + assert hf_checkpointer.mlflow_logging_config == { + 'task': 'text-generation', + 'metadata': { + 'task': 'llm/v1/completions' + } + } + @pytest.mark.world_size(2) @pytest.mark.gpu @@ -425,10 +438,18 @@ def test_huggingface_conversion_callback( trainer.fit() if dist.get_global_rank() == 0: - assert mlflow_logger_mock.save_model.call_count == (1 if log_to_mlflow - else 0) - assert mlflow_logger_mock.register_model.call_count == ( - 1 if log_to_mlflow else 0) + if log_to_mlflow: + assert mlflow_logger_mock.save_model.call_count == 1 + mlflow_logger_mock.save_model.assert_called_with( + flavor='transformers', + transformers_model=ANY, + path=ANY, + task='text-generation', + metadata={'task': 'llm/v1/completions'}) + assert mlflow_logger_mock.register_model.call_count == 1 + else: + assert mlflow_logger_mock.save_model.call_count == 0 + assert mlflow_logger_mock.register_model.call_count == 0 else: assert mlflow_logger_mock.log_model.call_count == 0 assert mlflow_logger_mock.register_model.call_count == 0 From e7962187b4397a22a0a63625b1af955a9a2424df Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Wed, 15 Nov 2023 19:09:13 -0500 Subject: [PATCH 41/49] fix script (#741) --- scripts/inference/hf_generate.py | 1 + 1 file changed, 1 insertion(+) diff --git 
a/scripts/inference/hf_generate.py b/scripts/inference/hf_generate.py index 96592ca477..45ddc6b63e 100644 --- a/scripts/inference/hf_generate.py +++ b/scripts/inference/hf_generate.py @@ -217,6 +217,7 @@ def main(args: Namespace) -> None: if device is not None: print(f'Placing model on {device=}...') model.to(device) + model.to(model_dtype) except Exception as e: raise RuntimeError( 'Unable to load HF model. ' + From e730995c4f0dfdaf9d9d547783739eec48880edb Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Wed, 15 Nov 2023 20:53:18 -0800 Subject: [PATCH 42/49] Bump to composer 0.17 (#736) --- setup.py | 6 ++--- tests/test_mpt_gen.py | 55 ++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 55 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index 05d1d1bbbe..afdfce8d48 100644 --- a/setup.py +++ b/setup.py @@ -47,7 +47,7 @@ ] install_requires = [ - 'mosaicml[libcloud,wandb,mlflow,oci,gcs]>=0.16.4,<0.17', + 'mosaicml[libcloud,wandb,mlflow,oci,gcs]>=0.17,<0.18', 'accelerate>=0.20,<0.21', # for HF inference `device_map` 'transformers>=4.34.1,<4.35', 'mosaicml-streaming>=0.7.1,<0.8', @@ -84,11 +84,11 @@ ] extra_deps['databricks'] = [ - 'mosaicml[databricks]', + 'mosaicml[databricks]>=0.17,<0.18', ] extra_deps['tensorboard'] = [ - 'mosaicml[tensorboard]>=0.16.1,<0.17', + 'mosaicml[tensorboard]>=0.17,<0.18', ] extra_deps['gpu'] = [ diff --git a/tests/test_mpt_gen.py b/tests/test_mpt_gen.py index 413e39bf8c..9f022ef487 100644 --- a/tests/test_mpt_gen.py +++ b/tests/test_mpt_gen.py @@ -95,9 +95,7 @@ def test_mpt_generate_multi_gpu(attn_impl: str, use_alibi: bool, @pytest.mark.gpu @pytest.mark.parametrize('attn_impl', ['triton', 'torch']) @pytest.mark.parametrize('use_alibi', [True, False]) -@pytest.mark.parametrize('tie_word_embeddings', [True, False]) def test_mpt_generate_callback(attn_impl: str, use_alibi: bool, - tie_word_embeddings: bool, build_tiny_mpt: Callable[..., ComposerMPTCausalLM], tiny_ft_dataloader: DataLoader): @@ -105,7 +103,7 @@ def test_mpt_generate_callback(attn_impl: str, use_alibi: bool, # build mpt model model = build_tiny_mpt( - tie_word_embeddings=tie_word_embeddings, + tie_word_embeddings=True, attn_config={ 'attn_impl': attn_impl, 'attn_uses_sequence_id': False, @@ -143,3 +141,54 @@ def test_mpt_generate_callback(attn_impl: str, use_alibi: bool, generate.generate.assert_called_once() trainer.logger.log_table.assert_called_once() + + +@pytest.mark.gpu +@pytest.mark.parametrize('attn_impl', ['triton', 'torch']) +@pytest.mark.parametrize('use_alibi', [True, False]) +def test_mpt_generate_callback_not_tied( + use_alibi: bool, attn_impl: str, + build_tiny_mpt: Callable[..., ComposerMPTCausalLM], + tiny_ft_dataloader: DataLoader): + device = get_device('gpu') + + # build mpt model + model = build_tiny_mpt( + tie_word_embeddings=False, + attn_config={ + 'attn_impl': attn_impl, + 'attn_uses_sequence_id': False, + 'alibi': use_alibi, + }, + ) + model = device.module_to_device(model) + + # generate callback + prompts = [ + 'The best banana bread recipe is', + '2+2=', + 'how much wood could a woodchuck chuck', + ] + gen_interval = 1 + generate = ComposerGenerate( + prompts, + interval=f'{gen_interval}ba', + max_new_tokens=5, + batch_size=len(prompts), + use_cache=True, + ) + generate.generate = Mock(wraps=generate.generate, autospec=True) + + # build trainer + trainer = Trainer( + model=model, + train_dataloader=tiny_ft_dataloader, + device=device, + max_duration=f'{gen_interval}ba', + callbacks=[generate], + ) + 
trainer.logger.log_table = Mock() + trainer.fit() + + generate.generate.assert_called_once() + trainer.logger.log_table.assert_called_once() From 25bb63f128e55477f1da2cf45d7c4118453b9206 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Fri, 17 Nov 2023 11:25:20 -0800 Subject: [PATCH 43/49] Patch os cpu count to avoid extra multiprocessing inside pytest which sometimes hangs (#745) --- tests/fixtures/data.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/fixtures/data.py b/tests/fixtures/data.py index 39032146b6..16dd01347d 100644 --- a/tests/fixtures/data.py +++ b/tests/fixtures/data.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 from pathlib import Path +from unittest.mock import MagicMock, patch from composer.utils import dist from omegaconf import DictConfig @@ -25,6 +26,7 @@ def tiny_ft_dataset_path(tmp_path: Path, dataset_size: int = 4) -> Path: @fixture +@patch('os.cpu_count', MagicMock(return_value=None)) def tiny_ft_dataloader(tiny_ft_dataset_path: Path, mpt_tokenizer: PreTrainedTokenizerBase, max_seq_len: int = 128, From 269ded6d074410845cba08d495b4e54dd673d1cf Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Fri, 17 Nov 2023 23:07:06 -0800 Subject: [PATCH 44/49] Reenable tests that were accidentally disabled (#746) --- tests/test_model.py | 187 ++++++++++++++++++-------------------------- 1 file changed, 75 insertions(+), 112 deletions(-) diff --git a/tests/test_model.py b/tests/test_model.py index 3308c65fd3..c160c064dc 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -304,17 +304,13 @@ def test_full_forward_and_backward_t5_small(batch_size: int = 2): assert not torch.equal(original_params, updated_params) +@pytest.mark.gpu @pytest.mark.parametrize( 'attn_impl,precision', [('torch', torch.float16), ('torch', torch.bfloat16), pytest.param('flash', torch.float16, marks=pytest.mark.gpu), pytest.param('flash', torch.bfloat16, marks=pytest.mark.gpu)]) def test_determinism(attn_impl: str, precision: torch.dtype): - if not torch.cuda.is_available(): - pytest.skip( - 'This test requires CUDA to be available in order to run with bfloat16 precision.' - ) - conf_path = 'scripts/train/yamls/pretrain/testing.yaml' with open(conf_path) as f: test_cfg = om.load(f) @@ -519,10 +515,12 @@ def test_mpt_creation(norm_type: str, no_bias: bool, tie_word_embeddings: bool): assert block.resid_ffn_dropout.p == 0.2 -@pytest.mark.parametrize('attention_impl,device', [('torch', 'cpu'), - ('flash', 'gpu'), - ('triton', 'gpu'), - ('torch', 'gpu')]) +@pytest.mark.parametrize('attention_impl', [ + 'torch', + pytest.param('flash', marks=pytest.mark.gpu), + pytest.param('triton', marks=pytest.mark.gpu), + pytest.param('torch', marks=pytest.mark.gpu) +]) @pytest.mark.parametrize('pos_emb_config', [{ 'alibi': False, 'rope': False @@ -550,24 +548,20 @@ def test_mpt_creation(norm_type: str, no_bias: bool, tie_word_embeddings: bool): }, }]) @pytest.mark.parametrize('tie_word_embeddings', [True, False]) -def test_forward_with_padding(attention_impl: str, device: str, - pos_emb_config: dict, tie_word_embeddings: bool): +def test_forward_with_padding(attention_impl: str, pos_emb_config: dict, + tie_word_embeddings: bool): # Test that different placement of padding does not affect the output. - if not torch.cuda.is_available() and device == 'gpu': - pytest.skip( - f'This test requires CUDA to be available in order to run with {attention_impl} attention.' 
- ) alibi = pos_emb_config['alibi'] if alibi and attention_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') rope = pos_emb_config['rope'] - if rope and pos_emb_config['rope_impl'] == 'dail' and ( - device != 'gpu' or not is_flash_v2_installed()): + if rope and pos_emb_config[ + 'rope_impl'] == 'dail' and not is_flash_v2_installed(): pytest.skip( f'dail implementation of rope requires gpu and flash attention 2.') - composer_device = get_device(device) + composer_device = get_device(None) hf_config = MPTConfig( init_device='cpu', @@ -743,12 +737,12 @@ def test_advanced_mask_building(attention_impl: str): assert torch.equal(attn_bias, expected_attn_bias) -@pytest.mark.parametrize('attention_impl,device,precision', [ - ('torch', 'cpu', 'fp32'), - ('flash', 'gpu', 'amp_bf16'), - ('triton', 'gpu', 'amp_bf16'), - ('torch', 'gpu', 'amp_bf16'), - ('torch', 'gpu', 'fp32'), +@pytest.mark.parametrize('attention_impl,precision', [ + ('torch', 'fp32'), + pytest.param('flash', 'amp_bf16', marks=pytest.mark.gpu), + pytest.param('triton', 'amp_bf16', marks=pytest.mark.gpu), + pytest.param('torch', 'amp_bf16', marks=pytest.mark.gpu), + pytest.param('torch', 'fp32', marks=pytest.mark.gpu), ]) @pytest.mark.parametrize('pos_emb_config', [{ 'alibi': False, @@ -777,25 +771,21 @@ def test_advanced_mask_building(attention_impl: str): }, }]) @pytest.mark.parametrize('tie_word_embeddings', [True, False]) -def test_generate(attention_impl: str, device: str, precision: str, - pos_emb_config: dict, tie_word_embeddings: bool): +def test_generate(attention_impl: str, precision: str, pos_emb_config: dict, + tie_word_embeddings: bool): # Test that generate works, and produces the same output with or without # padding in the input. - if not torch.cuda.is_available() and device == 'gpu': - pytest.skip( - f'This test requires CUDA to be available in order to run with {attention_impl} attention.' 
- ) if pos_emb_config['alibi'] and attention_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') - if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and ( - device != 'gpu' or not is_flash_v2_installed()): + if pos_emb_config['rope'] and pos_emb_config[ + 'rope_impl'] == 'dail' and not is_flash_v2_installed(): pytest.skip( f'dail implementation of rope requires gpu and flash attention 2.') if attention_impl == 'torch' and precision == 'amp_bf16' and tie_word_embeddings == False: pytest.skip(f'This test configuration has precision / sampling issues.') - composer_device = get_device(device) + composer_device = get_device(None) hf_config = MPTConfig( init_device='cpu', @@ -878,8 +868,6 @@ def test_generate(attention_impl: str, device: str, precision: str, @pytest.mark.parametrize('tie_word_embeddings', [True, False]) def test_generate_with_device_map(tmp_path: pathlib.Path, world_size: int, use_cache: bool, tie_word_embeddings: bool): - if not torch.cuda.is_available(): - pytest.skip(f'This test requires CUDA to be available.') if not torch.cuda.device_count() >= world_size: pytest.skip(f'This test requires {world_size} GPUs.') @@ -978,11 +966,11 @@ def test_save_from_pretrained(tmp_path: pathlib.Path): check_hf_model_equivalence(mpt, mpt2) -@pytest.mark.parametrize('attn_impl,device', [ - ('torch', 'cpu'), - ('flash', 'gpu'), - ('triton', 'gpu'), - ('torch', 'gpu'), +@pytest.mark.parametrize('attn_impl', [ + 'torch', + pytest.param('flash', marks=pytest.mark.gpu), + pytest.param('triton', marks=pytest.mark.gpu), + pytest.param('torch', marks=pytest.mark.gpu), ]) @pytest.mark.parametrize('pos_emb_config', [{ 'alibi': False, @@ -1011,22 +999,17 @@ def test_save_from_pretrained(tmp_path: pathlib.Path): }, }]) @pytest.mark.parametrize('tie_word_embeddings', [True, False]) -def test_forward_with_cache_and_padding(attn_impl: str, device: str, - pos_emb_config: dict, +def test_forward_with_cache_and_padding(attn_impl: str, pos_emb_config: dict, tie_word_embeddings: bool): # Tests that the result is the same with or without padding when using kv caching - if not torch.cuda.is_available() and device == 'gpu': - pytest.skip( - f'This test requires CUDA to be available in order to run with {attn_impl} attention.' 
- ) if pos_emb_config['alibi'] and attn_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') - if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and ( - device != 'gpu' or not is_flash_v2_installed()): + if pos_emb_config['rope'] and pos_emb_config[ + 'rope_impl'] == 'dail' and not is_flash_v2_installed(): pytest.skip( f'dail implementation of rope requires gpu and flash attention 2.') - composer_device = get_device(device) + composer_device = get_device(None) hf_config = MPTConfig( init_device='cpu', @@ -1120,11 +1103,11 @@ def test_forward_with_cache_and_padding(attn_impl: str, device: str, rtol=1e-6) -@pytest.mark.parametrize('attn_impl,device', [ - ('torch', 'cpu'), - ('flash', 'gpu'), - ('triton', 'gpu'), - ('torch', 'gpu'), +@pytest.mark.parametrize('attn_impl', [ + 'torch', + pytest.param('flash', marks=pytest.mark.gpu), + pytest.param('triton', marks=pytest.mark.gpu), + pytest.param('torch', marks=pytest.mark.gpu), ]) @pytest.mark.parametrize('pos_emb_config', [{ 'alibi': False, @@ -1153,23 +1136,19 @@ def test_forward_with_cache_and_padding(attn_impl: str, device: str, }, }]) @pytest.mark.parametrize('tie_word_embeddings', [True, False]) -def test_forward_with_cache(attn_impl: str, device: str, pos_emb_config: dict, +def test_forward_with_cache(attn_impl: str, pos_emb_config: dict, tie_word_embeddings: bool): # Test that model forward with and without the key-value cache produces the # same output. - if not torch.cuda.is_available() and device == 'gpu': - pytest.skip( - f'This test requires CUDA to be available in order to run with {attn_impl} attention.' - ) if pos_emb_config['alibi'] and attn_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') - if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and ( - device != 'gpu' or not is_flash_v2_installed()): + if pos_emb_config['rope'] and pos_emb_config[ + 'rope_impl'] == 'dail' and not is_flash_v2_installed(): pytest.skip( f'dail implementation of rope requires gpu and flash attention 2.') - composer_device = get_device(device) + composer_device = get_device(None) hf_config = MPTConfig( init_device='cpu', @@ -1264,11 +1243,11 @@ def test_forward_with_cache(attn_impl: str, device: str, pos_emb_config: dict, ) -@pytest.mark.parametrize('attn_impl,device', [ - ('torch', 'cpu'), - ('flash', 'gpu'), - ('triton', 'gpu'), - ('torch', 'gpu'), +@pytest.mark.parametrize('attn_impl', [ + 'torch', + pytest.param('flash', marks=pytest.mark.gpu), + pytest.param('triton', marks=pytest.mark.gpu), + pytest.param('torch', marks=pytest.mark.gpu), ]) @pytest.mark.parametrize('pos_emb_config', [{ 'alibi': False, @@ -1297,20 +1276,16 @@ def test_forward_with_cache(attn_impl: str, device: str, pos_emb_config: dict, }, }]) @pytest.mark.parametrize('tie_word_embeddings', [True, False]) -def test_generate_with_past_kv(attn_impl: str, device: str, - pos_emb_config: dict, tie_word_embeddings: bool): - if not torch.cuda.is_available() and device == 'gpu': - pytest.skip( - f'This test requires CUDA to be available in order to run with {attn_impl} attention.' 
- ) +def test_generate_with_past_kv(attn_impl: str, pos_emb_config: dict, + tie_word_embeddings: bool): if pos_emb_config['alibi'] and attn_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') - if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and ( - device != 'gpu' or not is_flash_v2_installed()): + if pos_emb_config['rope'] and pos_emb_config[ + 'rope_impl'] == 'dail' and not is_flash_v2_installed(): pytest.skip( f'dail implementation of rope requires gpu and flash attention 2.') - composer_device = get_device(device) + composer_device = get_device(None) hf_config = MPTConfig( init_device='cpu', @@ -1368,11 +1343,11 @@ def test_generate_with_past_kv(attn_impl: str, device: str, hf_config.d_model) -@pytest.mark.parametrize('attn_impl,device', [ - ('torch', 'cpu'), - ('flash', 'gpu'), - ('triton', 'gpu'), - ('torch', 'gpu'), +@pytest.mark.parametrize('attn_impl', [ + 'torch', + pytest.param('flash', marks=pytest.mark.gpu), + pytest.param('triton', marks=pytest.mark.gpu), + pytest.param('torch', marks=pytest.mark.gpu), ]) @pytest.mark.parametrize('generation_kwargs', [{ 'max_new_tokens': 2, @@ -1412,24 +1387,22 @@ def test_generate_with_past_kv(attn_impl: str, device: str, }, }]) @pytest.mark.parametrize('tie_word_embeddings', [True, False]) -def test_generation_kwargs_dont_crash(attn_impl: str, device: str, +def test_generation_kwargs_dont_crash(attn_impl: str, generation_kwargs: Dict[str, Any], pos_emb_config: dict, tie_word_embeddings: bool): - if not torch.cuda.is_available() and device == 'gpu': - pytest.skip( - f'This test requires CUDA to be available in order to run with {attn_impl} attention.' - ) if pos_emb_config['alibi'] and attn_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') - if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and ( - device != 'gpu' or not is_flash_v2_installed()): + if pos_emb_config['rope'] and pos_emb_config[ + 'rope_impl'] == 'dail' and not is_flash_v2_installed(): pytest.skip( f'dail implementation of rope requires gpu and flash attention 2.') - composer_device = get_device(device) - if device == 'gpu': # Switch deteminism off + composer_device = get_device(None) + + if composer_device.name == 'gpu': torch.use_deterministic_algorithms(False) + hf_config = MPTConfig( init_device='cpu', d_model=128, @@ -1463,7 +1436,8 @@ def test_generation_kwargs_dont_crash(attn_impl: str, device: str, _ = mpt.generate(input_ids=no_padding_input_ids, attention_mask=no_padding_attention_mask, **generation_kwargs) - if device == 'gpu': # Switch deteminism back on + + if composer_device.name == 'gpu': reproducibility.configure_deterministic_mode() @@ -1499,10 +1473,6 @@ def test_generation_kwargs_dont_crash(attn_impl: str, device: str, def test_model_to(attention_impl: str, pos_emb_config: dict, tie_word_embeddings: bool): # test that moving the model to diff devices and dtypes in diff ways does not break the model - if not torch.cuda.is_available(): - pytest.skip( - f'This test requires CUDA to be available in order to run with {attention_impl} attention.' 
- ) if pos_emb_config['alibi'] and attention_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') @@ -1597,11 +1567,11 @@ def test_alibi_vs_hf(): torch.testing.assert_close(alibi_bias_hf, alibi_bias_m) -@pytest.mark.parametrize('attn_impl,device', [ - ('torch', 'cpu'), - ('flash', 'gpu'), - ('triton', 'gpu'), - ('torch', 'gpu'), +@pytest.mark.parametrize('attn_impl', [ + 'torch', + pytest.param('flash', marks=pytest.mark.gpu), + pytest.param('triton', marks=pytest.mark.gpu), + pytest.param('torch', marks=pytest.mark.gpu), ]) @pytest.mark.parametrize('pos_emb_config', [{ 'alibi': False, @@ -1633,24 +1603,19 @@ def test_alibi_vs_hf(): @pytest.mark.parametrize('output_hidden_states', [True, False]) @pytest.mark.parametrize('tie_word_embeddings', [True, False]) def test_forward_with_output_attentions_and_output_hidden_states( - attn_impl: str, device: str, pos_emb_config: dict, - output_attentions: bool, output_hidden_states: bool, - tie_word_embeddings: bool): + attn_impl: str, pos_emb_config: dict, output_attentions: bool, + output_hidden_states: bool, tie_word_embeddings: bool): # Test that model forward with output_attentions_and_output_hidden_states - if not torch.cuda.is_available() and device == 'gpu': - pytest.skip( - f'This test requires CUDA to be available in order to run with {attn_impl} attention.' - ) if pos_emb_config['alibi'] and attn_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') if output_attentions and attn_impl in ['flash', 'triton']: pytest.skip(f'output_attentions only implemented with torch attention.') - if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and ( - device != 'gpu' or not is_flash_v2_installed()): + if pos_emb_config['rope'] and pos_emb_config[ + 'rope_impl'] == 'dail' and not is_flash_v2_installed(): pytest.skip( f'dail implementation of rope requires gpu and flash attention 2.') - composer_device = get_device(device) + composer_device = get_device(None) n_layers = 2 @@ -1708,8 +1673,6 @@ def test_hf_init(tmp_path: pathlib.Path, init_device: str, world_size: int, batch_size: int = 1): - if not torch.cuda.is_available(): - pytest.skip(f'This test requires CUDA to be available.') if not torch.cuda.device_count() >= world_size: pytest.skip(f'This test requires {world_size} GPUs.') From e547a28d6e128f6777360e0b8662226ea43f4ff7 Mon Sep 17 00:00:00 2001 From: Linden Li Date: Sun, 19 Nov 2023 01:03:42 +0000 Subject: [PATCH 45/49] Add shapes test --- llmfoundry/models/layers/blocks.py | 2 + llmfoundry/models/mpt/configuration_mpt.py | 15 ------ llmfoundry/models/mpt/modeling_mpt.py | 5 +- tests/test_model.py | 58 ++++++++++++++++++++++ 4 files changed, 62 insertions(+), 18 deletions(-) diff --git a/llmfoundry/models/layers/blocks.py b/llmfoundry/models/layers/blocks.py index 6605807c6b..2bd678ddb1 100644 --- a/llmfoundry/models/layers/blocks.py +++ b/llmfoundry/models/layers/blocks.py @@ -35,6 +35,8 @@ 'type': 'no_scaling', 'factor': 1.0, }, + 'tensor_parallel_qkvo': False, + 'tp_world_size': None, } diff --git a/llmfoundry/models/mpt/configuration_mpt.py b/llmfoundry/models/mpt/configuration_mpt.py index a30ee655dd..0df6f7c29a 100644 --- a/llmfoundry/models/mpt/configuration_mpt.py +++ b/llmfoundry/models/mpt/configuration_mpt.py @@ -22,21 +22,6 @@ 'ffn_type': 'mptmlp', } -attn_config_defaults: Dict = { - 'attn_type': 'multihead_attention', - 'attn_pdrop': 0.0, - 'attn_impl': 'triton', - 'qk_ln': False, - 'clip_qkv': None, - 'tensor_parallel_qkvo': False, - 
'tp_world_size': None, - 'softmax_scale': None, - 'prefix_lm': False, - 'attn_uses_sequence_id': False, - 'alibi': False, - 'alibi_bias_max': 8, -} - init_config_defaults: Dict = { 'name': 'kaiming_normal_', 'fan_mode': 'fan_in', diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py index dfb26967ae..e1ed15f520 100644 --- a/llmfoundry/models/mpt/modeling_mpt.py +++ b/llmfoundry/models/mpt/modeling_mpt.py @@ -8,7 +8,7 @@ import math import warnings -from functools import cached_property, partial +from functools import partial from typing import (Any, Dict, List, Mapping, MutableMapping, Optional, Tuple, Union) @@ -38,7 +38,7 @@ from omegaconf import OmegaConf as om from torch.distributed._tensor import (DeviceMesh, Shard, distribute_module, distribute_tensor) -from torch.distributed.tensor.parallel import (ColwiseParallel, RowwiseParallel, +from torch.distributed.tensor.parallel import (RowwiseParallel, make_input_replicate_1d, make_sharded_output_tensor, parallelize_module) @@ -266,7 +266,6 @@ def __init__(self, config: MPTConfig): mesh_dim_names=['ep', 'tp'], ) new_blocks = nn.ModuleList() - torch.set_printoptions(profile='full', sci_mode=False) for block in self.blocks: qkv_module = block.get_submodule('attn.Wqkv') oned_mesh = _create_1d_device_mesh(twod_mesh, tp_mesh_dim=1) diff --git a/tests/test_model.py b/tests/test_model.py index c160c064dc..8c6c78e2d3 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -12,7 +12,9 @@ import pytest import torch import torch.nn as nn +from torch.distributed._tensor.api import DTensor from accelerate import init_empty_weights +from composer import Trainer from composer.core.precision import Precision, get_precision_context from composer.optim import DecoupledAdamW from composer.trainer.dist_strategy import prepare_fsdp_module @@ -1800,3 +1802,59 @@ def test_head_dim_8_triton_mqa_attn(batch_size: int = 2): output = model(batch) assert not torch.isnan(output.logits).any() + +@pytest.mark.world_size(2) +@pytest.mark.gpu +def test_tp_qkvo(): + local_world_size = dist.get_local_world_size() + model_cfg = { + 'name': 'mpt_causal_lm', + 'init_device': 'cpu', + 'd_model': 128, + 'n_heads': 4, # head size 32 + 'n_layers': 2, + 'expansion_ratio': 1, + 'max_seq_len': 16, + 'vocab_size': 50368, + 'attn_config': { + 'attn_type': 'multihead_attention', + 'alibi': False, + 'attn_impl': 'torch', + 'tensor_parallel_qkvo': True, + 'tp_world_size': local_world_size + } + } + + model_cfg = om.create(model_cfg) + fsdp_config = { + 'sharding_strategy': 'NO_SHARD', + 'mixed_precision': 'DEFAULT' + } + + model = COMPOSER_MODEL_REGISTRY[model_cfg.name](model_cfg) + + # The trainer is used to wrap the model in FSDP, which can be used + # alongside with TP for 2D parallelism + trainer = Trainer( + model=model, + fsdp_config=fsdp_config, + ) + + transformer_blocks = model.model.transformer.blocks + for block in transformer_blocks: + attn_module = block._fsdp_wrapped_module.attn + + # Check that all attention module weights are DTensors + assert isinstance(attn_module.Wqkv.weight, DTensor) + assert isinstance(attn_module.out_proj.weight, DTensor) + + Wqkv_local = attn_module.Wqkv.weight._local_tensor + out_proj_local = attn_module.out_proj.weight._local_tensor + + # Wqkv is colwise-sharded, so its output dimension (dim 0 since torch + # stores everything along the transpose) is sharded along the device mesh + assert Wqkv_local.shape[0] * local_world_size == model_cfg.d_model * 3 + + # The out projection is row-wise sharded, so its 
input dimension (dim 1) + # is sharded along the device mesh + assert out_proj_local.shape[1] * local_world_size == model_cfg.d_model \ No newline at end of file From daf5a4a31faa5802a7b6dc798335801bccc883e1 Mon Sep 17 00:00:00 2001 From: Linden Li Date: Sun, 19 Nov 2023 01:49:41 +0000 Subject: [PATCH 46/49] Add weight test --- tests/test_model.py | 59 +++++++++++++++++++++++++++++++++------------ 1 file changed, 44 insertions(+), 15 deletions(-) diff --git a/tests/test_model.py b/tests/test_model.py index 8c6c78e2d3..df46fd18b6 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -1806,8 +1806,14 @@ def test_head_dim_8_triton_mqa_attn(batch_size: int = 2): @pytest.mark.world_size(2) @pytest.mark.gpu def test_tp_qkvo(): + # Note: we need the RNG state in this test to ensure that weights + # are initialized with the same values in both models. Without it, + # even with a random seed, the weights will be different since the + # RNG state changes with each init. + rng_state = reproducibility.get_rng_state() + local_world_size = dist.get_local_world_size() - model_cfg = { + sharded_model_cfg = { 'name': 'mpt_causal_lm', 'init_device': 'cpu', 'd_model': 128, @@ -1825,36 +1831,59 @@ def test_tp_qkvo(): } } - model_cfg = om.create(model_cfg) + # Create the same model config, but with TP turned off + full_model_cfg = copy.deepcopy(sharded_model_cfg) + full_model_cfg['attn_config']['tensor_parallel_qkvo'] = False + del full_model_cfg['attn_config']['tp_world_size'] + + sharded_model_cfg = om.create(sharded_model_cfg) + full_model_cfg = om.create(full_model_cfg) + + sharded_model = COMPOSER_MODEL_REGISTRY[sharded_model_cfg.name](sharded_model_cfg) + reproducibility.load_rng_state(rng_state) + + full_model = COMPOSER_MODEL_REGISTRY[full_model_cfg.name](full_model_cfg) + reproducibility.load_rng_state(rng_state) + fsdp_config = { 'sharding_strategy': 'NO_SHARD', 'mixed_precision': 'DEFAULT' } - - model = COMPOSER_MODEL_REGISTRY[model_cfg.name](model_cfg) - # The trainer is used to wrap the model in FSDP, which can be used # alongside with TP for 2D parallelism trainer = Trainer( - model=model, + model=sharded_model, + fsdp_config=fsdp_config, + seed=0 + ) + + trainer = Trainer( + model=full_model, fsdp_config=fsdp_config, + seed=0 ) - transformer_blocks = model.model.transformer.blocks - for block in transformer_blocks: - attn_module = block._fsdp_wrapped_module.attn + sharded_transformer_blocks = sharded_model.model.transformer.blocks + full_transformer_blocks = full_model.model.transformer.blocks + for sharded_block, full_block in zip(sharded_transformer_blocks, full_transformer_blocks): + sharded_attn_module = sharded_block._fsdp_wrapped_module.attn + full_attn_module = full_block._fsdp_wrapped_module.attn # Check that all attention module weights are DTensors - assert isinstance(attn_module.Wqkv.weight, DTensor) - assert isinstance(attn_module.out_proj.weight, DTensor) + assert isinstance(sharded_attn_module.Wqkv.weight, DTensor) + assert isinstance(sharded_attn_module.out_proj.weight, DTensor) - Wqkv_local = attn_module.Wqkv.weight._local_tensor - out_proj_local = attn_module.out_proj.weight._local_tensor + Wqkv_local = sharded_attn_module.Wqkv.weight._local_tensor + out_proj_local = sharded_attn_module.out_proj.weight._local_tensor # Wqkv is colwise-sharded, so its output dimension (dim 0 since torch # stores everything along the transpose) is sharded along the device mesh - assert Wqkv_local.shape[0] * local_world_size == model_cfg.d_model * 3 + assert Wqkv_local.shape[0] * 
local_world_size == sharded_model_cfg.d_model * 3 # The out projection is row-wise sharded, so its input dimension (dim 1) # is sharded along the device mesh - assert out_proj_local.shape[1] * local_world_size == model_cfg.d_model \ No newline at end of file + assert out_proj_local.shape[1] * local_world_size == sharded_model_cfg.d_model + + # Check that the sharded output weights are the same as the full model + # weights + assert torch.equal(out_proj_local, full_attn_module.out_proj.weight[:, :out_proj_local.shape[1]]) \ No newline at end of file From 372255c212ac13f2db8683ff7724bb575d31b79e Mon Sep 17 00:00:00 2001 From: Linden Li Date: Sun, 19 Nov 2023 01:53:50 +0000 Subject: [PATCH 47/49] tests actually pass now --- tests/test_model.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/test_model.py b/tests/test_model.py index df46fd18b6..82461de7f3 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -1885,5 +1885,9 @@ def test_tp_qkvo(): assert out_proj_local.shape[1] * local_world_size == sharded_model_cfg.d_model # Check that the sharded output weights are the same as the full model - # weights - assert torch.equal(out_proj_local, full_attn_module.out_proj.weight[:, :out_proj_local.shape[1]]) \ No newline at end of file + # weights - rank 0 should have the top half and rank 1 should have the + # bottom half + if dist.get_local_rank() == 0: + assert torch.equal(out_proj_local, full_attn_module.out_proj.weight[:, :out_proj_local.shape[1]]) + else: + assert torch.equal(out_proj_local, full_attn_module.out_proj.weight[:, out_proj_local.shape[1]:]) \ No newline at end of file From d5bba2e820d34ef2c6d43fff4c196d91fe17a282 Mon Sep 17 00:00:00 2001 From: Linden Li Date: Sun, 19 Nov 2023 01:54:51 +0000 Subject: [PATCH 48/49] get rid of unnecessary rng stage calls --- tests/test_model.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/test_model.py b/tests/test_model.py index 82461de7f3..41a7bc852b 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -1843,7 +1843,6 @@ def test_tp_qkvo(): reproducibility.load_rng_state(rng_state) full_model = COMPOSER_MODEL_REGISTRY[full_model_cfg.name](full_model_cfg) - reproducibility.load_rng_state(rng_state) fsdp_config = { 'sharding_strategy': 'NO_SHARD', @@ -1854,13 +1853,11 @@ def test_tp_qkvo(): trainer = Trainer( model=sharded_model, fsdp_config=fsdp_config, - seed=0 ) trainer = Trainer( model=full_model, fsdp_config=fsdp_config, - seed=0 ) sharded_transformer_blocks = sharded_model.model.transformer.blocks From 6a2b18a4b0efb9ff561458957d8318a23511d2d0 Mon Sep 17 00:00:00 2001 From: Linden Li Date: Sun, 19 Nov 2023 02:12:38 +0000 Subject: [PATCH 49/49] Add other weight test --- tests/test_model.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/tests/test_model.py b/tests/test_model.py index 41a7bc852b..8ec19bb27b 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -33,6 +33,7 @@ from llmfoundry.models.layers.attention import is_flash_v2_installed from llmfoundry.models.layers.blocks import MPTBlock from llmfoundry.models.mpt import MPTConfig, MPTForCausalLM +from llmfoundry.models.mpt.modeling_mpt import rearrange_tensor from llmfoundry.utils import build_tokenizer @@ -1877,14 +1878,24 @@ def test_tp_qkvo(): # stores everything along the transpose) is sharded along the device mesh assert Wqkv_local.shape[0] * local_world_size == sharded_model_cfg.d_model * 3 - # The out projection is row-wise sharded, so its input dimension (dim 1) + # The out 
projection is rowwise-sharded, so its input dimension (dim 1) # is sharded along the device mesh assert out_proj_local.shape[1] * local_world_size == sharded_model_cfg.d_model + Wqkv_interleaved = rearrange_tensor( + full_attn_module.Wqkv.weight, + local_world_size, + sharded_model_cfg.d_model, + sharded_model_cfg.d_model // sharded_model_cfg.n_heads, + sharded_model_cfg.n_heads + ) # Check that the sharded output weights are the same as the full model - # weights - rank 0 should have the top half and rank 1 should have the - # bottom half + # weights: + # rank 0 should have the top half of out proj and the left half of Wqkv + # rank 1 should have the bottom half of out proj and the right half of Wqkv if dist.get_local_rank() == 0: assert torch.equal(out_proj_local, full_attn_module.out_proj.weight[:, :out_proj_local.shape[1]]) + assert torch.equal(Wqkv_local, Wqkv_interleaved[:Wqkv_local.shape[0], :]) else: - assert torch.equal(out_proj_local, full_attn_module.out_proj.weight[:, out_proj_local.shape[1]:]) \ No newline at end of file + assert torch.equal(out_proj_local, full_attn_module.out_proj.weight[:, out_proj_local.shape[1]:]) + assert torch.equal(Wqkv_local, Wqkv_interleaved[Wqkv_local.shape[0]:, :]) \ No newline at end of file
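
The tensor-parallel test added across the last few patches checks a simple shape invariant: with the fused Wqkv projection column-wise sharded and the output projection row-wise sharded, each rank holds a [3 * d_model / world_size, d_model] slice of Wqkv (split along its output dimension, dim 0 of the weight as torch.nn.Linear stores it) and a [d_model, d_model / world_size] slice of out_proj (split along its input dimension, dim 1). The sketch below reproduces only that bookkeeping with plain tensors and torch.chunk; it is illustrative, not the DTensor/FSDP machinery from the patches, and the function and variable names are made up for the example.

import torch

def shard_attention_weights(d_model: int, world_size: int, rank: int):
    # Full (unsharded) weights, laid out the way torch.nn.Linear stores them:
    #   Wqkv:     [out_features, in_features] = [3 * d_model, d_model]
    #   out_proj: [d_model, d_model]
    wqkv_full = torch.randn(3 * d_model, d_model)
    out_proj_full = torch.randn(d_model, d_model)

    # Column-wise sharding of Wqkv: split the output dimension (dim 0),
    # so rank 0 holds the top slice, rank 1 the next slice, and so on.
    wqkv_local = torch.chunk(wqkv_full, world_size, dim=0)[rank]

    # Row-wise sharding of out_proj: split the input dimension (dim 1),
    # so rank 0 holds the leftmost block of columns.
    out_proj_local = torch.chunk(out_proj_full, world_size, dim=1)[rank]

    # The same shape identities test_tp_qkvo asserts on the DTensor
    # ._local_tensor shards.
    assert wqkv_local.shape[0] * world_size == 3 * d_model
    assert out_proj_local.shape[1] * world_size == d_model
    return wqkv_local, out_proj_local

if __name__ == '__main__':
    for rank in range(2):
        shard_attention_weights(d_model=128, world_size=2, rank=rank)

Note that the real test additionally passes the full Wqkv weight through rearrange_tensor before comparing it to the per-rank shards; the plain chunking above does not model that rearrangement.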