diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml
index 83c9a63884..13a835356c 100644
--- a/.github/workflows/docker.yaml
+++ b/.github/workflows/docker.yaml
@@ -29,7 +29,12 @@ jobs:
- name: '2.1.0_cu121_flash2'
base_image: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04
dep_groups: '[gpu-flash2]'
-
+ - name: '2.1.0_cu121_aws'
+ base_image: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04-aws
+ dep_groups: '[gpu]'
+ - name: '2.1.0_cu121_flash2_aws'
+ base_image: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04-aws
+ dep_groups: '[gpu-flash2]'
steps:
- name: Maximize Build Space on Worker
uses: easimon/maximize-build-space@v4
diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml
index 1151837111..ffbfac4585 100644
--- a/.github/workflows/pr-gpu.yaml
+++ b/.github/workflows/pr-gpu.yaml
@@ -40,7 +40,7 @@ jobs:
if: github.repository_owner == 'mosaicml'
with:
container: ${{ matrix.container }}
- mcloud-timeout: 1200
+ mcloud-timeout: 1800
name: ${{ matrix.name }}
pytest-command: ${{ matrix.pytest_command }}
pytest-markers: ${{ matrix.markers }}
diff --git a/README.md b/README.md
index 04bad9c519..4a4e60e844 100644
--- a/README.md
+++ b/README.md
@@ -45,15 +45,15 @@ You'll find in this repo:
Mosaic Pretrained Transformers (MPT) are GPT-style models with some special features -- Flash Attention for efficiency, ALiBi for context length extrapolation, and stability improvements to mitigate loss spikes. As part of MosaicML's Foundation series, we have open-sourced several MPT models:
-| Model | Context Length | Download | Demo | Commercial use? |
-|--------------------|----------------|----------------------------------------------------|------------------------------------------------------------------|-----------------|
-| MPT-30B | 8192 | https://huggingface.co/mosaicml/mpt-30b | | Yes |
-| MPT-30B-Instruct | 8192 | https://huggingface.co/mosaicml/mpt-30b-instruct | | Yes |
-| MPT-30B-Chat | 8192 | https://huggingface.co/mosaicml/mpt-30b-chat | [Demo](https://huggingface.co/spaces/mosaicml/mpt-30b-chat) | No |
-| MPT-7B | 2048 | https://huggingface.co/mosaicml/mpt-7b | | Yes |
-| MPT-7B-Instruct | 2048 | https://huggingface.co/mosaicml/mpt-7b-instruct | | Yes |
-| MPT-7B-Chat | 2048 | https://huggingface.co/mosaicml/mpt-7b-chat | [Demo](https://huggingface.co/spaces/mosaicml/mpt-7b-chat) | No |
-| MPT-7B-StoryWriter | 65536 | https://huggingface.co/mosaicml/mpt-7b-storywriter | | Yes |
+| Model | Context Length | Download | Demo | Commercial use? |
+| ------------------ | -------------- | -------------------------------------------------- | ----------------------------------------------------------- | --------------- |
+| MPT-30B | 8192 | https://huggingface.co/mosaicml/mpt-30b | | Yes |
+| MPT-30B-Instruct | 8192 | https://huggingface.co/mosaicml/mpt-30b-instruct | | Yes |
+| MPT-30B-Chat | 8192 | https://huggingface.co/mosaicml/mpt-30b-chat | [Demo](https://huggingface.co/spaces/mosaicml/mpt-30b-chat) | No |
+| MPT-7B | 2048 | https://huggingface.co/mosaicml/mpt-7b | | Yes |
+| MPT-7B-Instruct | 2048 | https://huggingface.co/mosaicml/mpt-7b-instruct | | Yes |
+| MPT-7B-Chat | 2048 | https://huggingface.co/mosaicml/mpt-7b-chat | [Demo](https://huggingface.co/spaces/mosaicml/mpt-7b-chat) | No |
+| MPT-7B-StoryWriter | 65536 | https://huggingface.co/mosaicml/mpt-7b-storywriter | | Yes |
To try out these models locally, [follow the instructions](https://github.com/mosaicml/llm-foundry/tree/main/scripts/inference#interactive-generation-with-modelgenerate) in `scripts/inference/README.md` to prompt HF models using our [hf_generate.py](https://github.com/mosaicml/llm-foundry/blob/main/scripts/inference/hf_generate.py) or [hf_chat.py](https://github.com/mosaicml/llm-foundry/blob/main/scripts/inference/hf_chat.py) scripts.
@@ -89,17 +89,17 @@ This codebase has been tested with PyTorch 1.13.1 and PyTorch 2.0.1 on systems w
This codebase may also work on systems with other devices, such as consumer NVIDIA cards and AMD cards, but we are not actively testing these systems.
If you have success/failure using LLM Foundry on other systems, please let us know in a Github issue and we will update the support matrix!
-| Device | Torch Version | Cuda Version | Status |
-|---------------------------|------------------|--------------|-------------------------------|
-| A100-40GB/80GB | 1.13.1 | 11.7 | :white_check_mark: Supported |
-| A100-40GB/80GB | 2.0.1 | 11.7, 11.8 | :white_check_mark: Supported |
-| A100-40GB/80GB | 2.1.0 | 11.8, 12.1 | :white_check_mark: Supported |
-| H100-80GB | 1.13.1 | 11.7 | :x: Not Supported |
-| H100-80GB | 2.0.1 | 11.8 | :white_check_mark: Supported |
-| H100-80GB | 2.1.0 | 12.1 | :white_check_mark: Supported |
-| A10-24GB | 1.13.1 | 11.7 | :construction: In Progress |
-| A10-24GB | 2.0.1 | 11.7, 11.8 | :construction: In Progress |
-| MI250 | 2.0.1 | ROCm 5.4 | :construction: In Progress |
+| Device | Torch Version | Cuda Version | Status |
+| -------------- | ------------- | ------------ | ---------------------------- |
+| A100-40GB/80GB | 1.13.1 | 11.7 | :white_check_mark: Supported |
+| A100-40GB/80GB | 2.0.1 | 11.7, 11.8 | :white_check_mark: Supported |
+| A100-40GB/80GB | 2.1.0 | 11.8, 12.1 | :white_check_mark: Supported |
+| H100-80GB | 1.13.1 | 11.7 | :x: Not Supported |
+| H100-80GB | 2.0.1 | 11.8 | :white_check_mark: Supported |
+| H100-80GB | 2.1.0 | 12.1 | :white_check_mark: Supported |
+| A10-24GB | 1.13.1 | 11.7 | :construction: In Progress |
+| A10-24GB | 2.0.1 | 11.7, 11.8 | :construction: In Progress |
+| MI250 | 2.0.1 | ROCm 5.4 | :construction: In Progress |
## MosaicML Docker Images
We highly recommend using our prebuilt Docker images. You can find them here: https://hub.docker.com/orgs/mosaicml/repositories.
@@ -111,15 +111,17 @@ You can select a specific commit hash such as `mosaicml/llm-foundry:1.13.1_cu117
**Please Note:** The `mosaicml/llm-foundry` images do not come with the `llm-foundry` package preinstalled, just the dependencies. You will still need to `pip install llm-foundry` either from PyPi or from source.
-| Docker Image | Torch Version | Cuda Version | LLM Foundry dependencies installed? |
-|-------------------------------------------------------------|----------------|--------------|-------------------------------------|
-| `mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04` | 1.13.1 | 11.7 | No |
-| `mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04` | 2.0.1 | 11.8 | No |
-| `mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04` | 2.1.0 | 12.1 | No |
-| `mosaicml/llm-foundry:1.13.1_cu117-latest` | 1.13.1 | 11.7 | Yes |
-| `mosaicml/llm-foundry:2.0.1_cu118-latest` | 2.0.1 | 11.8 | Yes |
-| `mosaicml/llm-foundry:2.1.0_cu121-latest` | 2.1.0 | 12.1 | Yes (flash attention v1) |
-| `mosaicml/llm-foundry:2.1.0_cu121_flash2-latest` | 2.1.0 | 12.1 | Yes (flash attention v2) |
+| Docker Image | Torch Version | Cuda Version | LLM Foundry dependencies installed? |
+| ------------------------------------------------------ | ------------- | ----------------- | ----------------------------------- |
+| `mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04` | 1.13.1 | 11.7 (Infiniband) | No |
+| `mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04` | 2.0.1 | 11.8 (Infiniband) | No |
+| `mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04` | 2.1.0 | 12.1 (Infiniband) | No |
+| `mosaicml/llm-foundry:1.13.1_cu117-latest` | 1.13.1 | 11.7 (Infiniband) | Yes |
+| `mosaicml/llm-foundry:2.0.1_cu118-latest` | 2.0.1 | 11.8 (Infiniband) | Yes |
+| `mosaicml/llm-foundry:2.1.0_cu121-latest` | 2.1.0 | 12.1 (Infiniband) | Yes (flash attention v1) |
+| `mosaicml/llm-foundry:2.1.0_cu121_flash2-latest` | 2.1.0 | 12.1 (Infiniband) | Yes (flash attention v2) |
+| `mosaicml/llm-foundry:2.1.0_cu121_aws-latest` | 2.1.0 | 12.1 (EFA) | Yes (flash attention v1) |
+| `mosaicml/llm-foundry:2.1.0_cu121_flash2_aws-latest` | 2.1.0 | 12.1 (EFA) | Yes (flash attention v2) |
# Installation
@@ -181,14 +183,14 @@ source llmfoundry-venv-amd/bin/activate
# installs
pip install cmake packaging torch
-pip install -e . # this installs some things which are not needed but they dont hurt
+pip install -e . # This installs some things that are not needed but they don't hurt
pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.4.2
```
**Lastly**, install the ROCm enabled flash attention (instructions [here](https://github.com/ROCmSoftwarePlatform/flash-attention/tree/flash_attention_for_rocm2#amd-gpurocm-support)).
Notes:
1. `attn_impl: triton` does not work.
-1. We don't yet have a docker img where everything works perfectly. You might need to up/down grade some packages (in our case, we needed to downgrade to `numpy==1.23.5`) before everything works without issue.
+1. We don't yet have a Docker image where everything works perfectly. You might need to up/downgrade some packages (in our case, we needed to downgrade to `numpy==1.23.5`) before everything works without issue.
# Quickstart
@@ -228,7 +230,7 @@ python inference/convert_composer_to_hf.py \
# --hf_repo_for_upload user-org/repo-name
# Evaluate the model on a subset of tasks
-python eval/eval.py \
+composer eval/eval.py \
eval/yamls/hf_eval.yaml \
icl_tasks=eval/yamls/copa.yaml \
model_name_or_path=mpt-125m-hf
diff --git a/TUTORIAL.md b/TUTORIAL.md
index 36993bc409..86bd9829e9 100644
--- a/TUTORIAL.md
+++ b/TUTORIAL.md
@@ -8,27 +8,42 @@ Forging LLMs can be quite complicated — you have to get your data prepared, se
This tutorial will provide a brief intro to the repo’s structure and underlying tools (all courtesy of MosaicML, of course), will go over a few example workflows and point you to the related resources within the repo, and will finally cover a number of FAQs that we have encountered since release.
+- [LLM Foundry Tutorial](#llm-foundry-tutorial)
- [Intro](#intro)
- [How this repo is structured](#how-this-repo-is-structured)
- [Key components](#key-components)
+ - [Composer](#composer)
+ - [StreamingDataset](#streamingdataset)
+ - [MCLI](#mcli)
- [How the YAMLs work](#how-the-yamls-work)
- [Example Workflows](#example-workflows)
- [Workflow 1: I want to play with a HF model like MPT-7B locally](#workflow-1-i-want-to-play-with-a-hf-model-like-mpt-7b-locally)
- [Workflow 2: I want to deploy an inference endpoint with a HF model like MPT-7B](#workflow-2-i-want-to-deploy-an-inference-endpoint-with-a-hf-model-like-mpt-7b)
- [Workflow 3: I want to finetune a HF model like MPT-7B](#workflow-3-i-want-to-finetune-a-hf-model-like-mpt-7b)
+ - [Supervised FineTuning and Instruction FineTuning](#supervised-finetuning-and-instruction-finetuning)
+ - [Domain Adaptation and Sequence Length Adaptation](#domain-adaptation-and-sequence-length-adaptation)
+ - [Data](#data)
+ - [Modeling](#modeling)
- [Workflow 4: I want to train a new HF model from scratch](#workflow-4-i-want-to-train-a-new-hf-model-from-scratch)
- [FAQs](#faqs)
- - [Why is the script only using 1 out of N GPUs?](#why-is-the-script-only-using-1-out-of-n-gpus)
- - [I’m running into an Out-Of-Memory (OOM) error. What do I do?](#im-running-into-an-out-of-memory-oom-error-what-do-i-do)
- - [What hardware can I train on?](#what-hardware-can-i-train-on)
- - [What hardware can I run eval on?](#what-hardware-can-i-run-eval-on)
- - [What is FSDP?](#what-is-fsdp)
- - [What are the different attention options `torch` / `flash` / `triton` for MPT and which one should I use?](#what-are-the-different-attention-options-torch--flash--triton-for-mpt-and-which-one-should-i-use)
- - [Can I finetune using PEFT / LORA?](#can-i-finetune-using-peft--lora)
- - [Can I quantize these models and/or run on CPU?](#can-i-quantize-these-models-andor-run-on-cpu)
- - [How do I deploy with ONNX/FasterTransformer?](#how-do-i-deploy-with-onnxfastertransformer)
- - [How expensive is it to build LLMs?](#how-expensive-is-it-to-build-llms)
- - [Common installation issues](#common-installation-issues)
+ - [Why is the script only using 1 out of N GPUs?](#why-is-the-script-only-using-1-out-of-n-gpus)
+ - [I’m running into an Out-Of-Memory (OOM) error. What do I do?](#im-running-into-an-out-of-memory-oom-error-what-do-i-do)
+ - [What hardware can I train on?](#what-hardware-can-i-train-on)
+ - [What hardware can I run eval on?](#what-hardware-can-i-run-eval-on)
+ - [What hardware can I run inference on?](#what-hardware-can-i-run-inference-on)
+ - [What is FSDP?](#what-is-fsdp)
+ - [What are the different attention options `torch` / `flash` / `triton` for MPT and which one should I use?](#what-are-the-different-attention-options-torch--flash--triton--for-mpt-and-which-one-should-i-use)
+ - [Limitations](#limitations)
+ - [What is `triton-pre-mlir`?](#what-is-triton-pre-mlir)
+ - [Known issue with sm86+ GPUs](#known-issue-with-sm86-gpus)
+ - [Support for FlashAttention-2](#support-for-flashattention-2)
+ - [What kinds of positional embeddings does LLM Foundry support?](#what-kinds-of-positional-embeddings-does-llm-foundry-support)
+ - [Can I finetune using PEFT / LoRA?](#can-i-finetune-using-peft--lora)
+ - [Can I quantize these models and/or run on CPU?](#can-i-quantize-these-models-andor-run-on-cpu)
+ - [How do I deploy with ONNX/FasterTransformer?](#how-do-i-deploy-with-onnxfastertransformer)
+ - [TransformerEngine and amp\_fp8 support](#transformerengine-and-amp_fp8-support)
+ - [How expensive is it to build LLMs?](#how-expensive-is-it-to-build-llms)
+ - [Common installation issues](#common-installation-issues)
Let’s get started!
@@ -68,7 +83,7 @@ The Trainer is a pytorch-native object that composes your model, dataset(s), opt
Spending some time understanding the Composer Trainer is a great way to form a deeper understanding of what the train and eval scripts are doing under the hood.
Composer also comes packaged with the `composer` launcher.
-If you go through our docs, you'll notice that we instruct you to launch the train script (`scripts/train/train.py`) and eval script (`scripts/eval/eval.py`) using the launcher, like so,
+If you go through our docs, you'll notice that we instruct you to launch the training script (`scripts/train/train.py`) and eval script (`scripts/eval/eval.py`) using the launcher, like so,
```bash
@@ -81,7 +96,7 @@ The `composer` launcher puts all your GPUs to work by launching the script on a
### StreamingDataset
The training script contains logic for building a few different types of dataloaders used for different training tasks.
-Each of these dataloaders are built to work with **streaming datasets**.
+Each of these dataloaders is built to work with **streaming datasets**.
There are a number of benefits that come from using streaming datasets, from fast, deterministic resumption to easily loading from a mixture of streams at once.
The scripts in `scripts/data_prep/` are your one-stop-shop for converting a local dataset or a dataset on the Hugging Face Hub to our streaming MDS format.
@@ -178,7 +193,7 @@ We address two possible versions of “finetuning” here. For both, you’ll wa
### Supervised FineTuning and Instruction FineTuning
-`scripts/train/` already includes some resources for supervised finetuning. If that’s what you’re interestested in check out
+`scripts/train/` already includes some resources for supervised finetuning. If that’s what you’re interested in check out
1. [**LLM Finetuning from a Local Dataset: A Concrete Example**](https://github.com/mosaicml/llm-foundry/blob/main/scripts/train/finetune_example/README.md)
2. [The YAML which should replicate the process of creating MPT-7B-Instruct from MPT-7b](https://github.com/mosaicml/llm-foundry/blob/main/scripts/train/yamls/finetune/mpt-7b_dolly_sft.yaml) — You can point this at your own dataset by [following these instructions](https://github.com/mosaicml/llm-foundry/blob/main/scripts/train/README.md#Usage)
@@ -228,7 +243,7 @@ After you're done training, you probably want to convert your Composer checkpoin
> **Note**
> Pretraining for 10s of billions of tokens is a large job even for a smaller model; you’ll want multiple A100s for this example.
-It is conceivable that you would like to train a model *with the same architecture* as a model available in HuggingFace `transformers` but without using those same weights; for example, if you have a large amount of proprietary data, or want to change something about the model that is hard to change after the fact. So, as an example, let’s say you want a version of `gpt2` but with longer sequence length, say 2048. Using the MPT architecture would give us Flash Attention and ALiBi, allowing us to go much longer; but for this example we stick with 2048. And of course, let’s use 150 tokens/parameter, which is the ratio that MPT-7B used, getting us to 17.55B tokens for our 117M param model.
+It is conceivable that you would like to train a model *with the same architecture* as a model available in HuggingFace `transformers` but without using those same weights; for example, if you have a large amount of proprietary data, or want to change something about the model that is hard to change after the fact. So, as an example, let’s say you want a version of `gpt2` but with a longer sequence length, say 2048. Using the MPT architecture would give us Flash Attention and ALiBi, allowing us to go much longer; but for this example we stick with 2048. And of course, let’s use 150 tokens/parameter, which is the ratio that MPT-7B used, getting us to 17.55B tokens for our 117M param model.
The first step to training from scratch is to get your pretraining data prepared. Following [the data preparation README](https://github.com/mosaicml/llm-foundry/blob/main/scripts/data_prep/README.md), we convert C4 as follows:
@@ -294,25 +309,25 @@ The purpose of this section is probably pretty self-evident. You’ve got questi
- **Long answer:** In NLP, Softmax Attention operates on a sequence. It is an all to all graph operation where, during training, the memory complexity is quadratic with respect to the length of the sequence. Furthermore, on GPUs, naive implementations of Softmax Attention are bandwidth (BW) limited.
[Rabe et al. (2021)](https://arxiv.org/abs/2112.05682) and [Dao et al. (2022)](https://arxiv.org/abs/2205.14135) showed that fusing all operations in Softmax Attention can make the operation much less BW limited.
-Furthermore, integrating a recompuation schema decreases the sequence length memory complexity from *quadratic* to *linear*, thereby supporting much longer sequence lengths.
+Furthermore, integrating a recomputation schema decreases the sequence length memory complexity from *quadratic* to *linear*, thereby supporting much longer sequence lengths.
- Setting `attn_config.attn_impl=torch` enables a naive Softmax Attention written using base torch operations.
- Setting `attn_config.attn_impl=flash` enables Flash Attention [implemented by Dao et al in the HazyResearch repo using CUDA](https://github.com/HazyResearch/flash-attention). This will have linear memory complexity (enabling larger batch sizes) and will run much faster.
- - Setting `attn_config.attn_impl=triton` enables a Flash Attention [implemented using Triton](https://github.com/mosaicml/llm-foundry/blob/main/llmfoundry/models/layers/flash_attn_triton.py). In our experiance, `triton` is slightly faster than `flash`.
+ - Setting `attn_config.attn_impl=triton` enables a Flash Attention [implemented using Triton](https://github.com/mosaicml/llm-foundry/blob/main/llmfoundry/models/layers/flash_attn_triton.py). In our experience, `triton` is slightly faster than `flash`.
-
#### Limitations
- For training, `torch` uses a lot of memory and is slow.
-- `flash` and `triton` cannot return attention weights and therefore cannot be used with methods which require it.
-- `flash` cannot accept an attention bias and therefore cannot be used with methods which require it such as ALiBi.
+- `flash` and `triton` cannot return attention weights and therefore cannot be used with methods that require it.
+- `flash` cannot accept an attention bias and therefore cannot be used with methods that require it such as ALiBi.
#### What is `triton-pre-mlir`?
- Torch2 installs and requires a specific version of [Triton](https://openai.com/research/triton).
@@ -328,6 +343,18 @@ The majority of our training setups use `triton`. -->
Updating to LLVM14 (or LLVM15) cannot be done because there are breaking changes.
What is the result of this? Although sm89+ is not **formally** supported until LLVM15, our testing on H100 GPUs shows that `attn_impl=triton` still works well and still runs fast. The only issue is that when the network is starting to run, LLVM might throw a warning like: `'sm_90' is not a recognized processor for this target (ignoring processor)`. This warning does not seem to affect performance.
+#### Support for FlashAttention-2
+- [FlashAttention-2](https://arxiv.org/pdf/2307.08691.pdf) improves upon FlashAttention to get even faster attention computation. LLM Foundry supports FlashAttention-2. Please follow the instructions [here](https://github.com/mosaicml/llm-foundry/tree/main/scripts/train#flashattention).
+
+### What kinds of positional embeddings does LLM Foundry support?
+Currently we support [Learned Positional Embeddings](https://arxiv.org/pdf/1706.03762.pdf), [Attention with Linear Biases (ALiBi)](https://arxiv.org/pdf/2108.12409.pdf), and [Rotary Positional Embeddings (RoPE)](https://arxiv.org/pdf/2104.09864.pdf). There is also an option to switch off all of these embeddings to get [No Positional Embedding](https://arxiv.org/pdf/2203.16634.pdf).
+
+| Name | YAML Config | Training MFU on MPT-7B trained on 8 A100 80GB GPUs | Notes |
+|:-----------------------------------|:------------------------------------------------------------------|:---------------------------------------------------:|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| Learned Positional Embeddings | <pre>model:<br>  learned_pos_emb: True</pre> | 65.7 | |
+| ALiBi | <pre>model:<br>  attn_config:<br>    alibi: True</pre> | 64.5 | Requires Triton or Torch attention. |
+| RoPE (Dao-AILab Implementation) | <pre>model:<br>  attn_config:<br>    rope: True<br>    rope_impl: dail</pre> | 64.5 | Requires a CUDA GPU and the [flash-attn library](https://github.com/Dao-AILab/flash-attention) v2.0.1 or higher to be installed. Please see the instructions in the [paragraph above](#support-for-flashattention-2) on how to install flash-attn v2. Note that the attention implementation can still be `torch`, `triton`, or `flash`. |
+| RoPE (Hugging Face Implementation) | <pre>model:<br>  attn_config:<br>    rope: True<br>    rope_impl: hf</pre> | 62.3 | |
### Can I finetune using PEFT / LoRA?
- The LLM Foundry codebase does not directly have examples of PEFT or LORA workflows. However, our MPT model is a subclass of HuggingFace `PretrainedModel`, and https://github.com/mosaicml/llm-foundry/pull/346 added required features to enable HuggingFace’s [PEFT](https://huggingface.co/docs/peft/index) / [LORA](https://huggingface.co/docs/peft/conceptual_guides/lora) workflows for MPT. MPT models with LoRA modules can be trained either using LLM Foundry or Hugging Face's [accelerate](https://huggingface.co/docs/accelerate/index). Within LLM Foundry, run (`scripts/train/train.py`), adding `lora` arguments to the config `.yaml`, like so:
@@ -370,7 +397,7 @@ model:
```
enables [TransformerEngine's LayerNormMLP](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/api/pytorch.html#transformer_engine.pytorch.LayerNormMLP) layer which enables sequence parallelism if configured correctly.
-WARNING: `state_dicts` generated with `ffn_type: te_ln_mlp` will NOT directly map to `state_dicts` generated using the default network configurations. We do not have control over how `te.LayerNormMLP` is implemented and therefore cannot reasily reconcile it with the default implementation (or any other implementation).
+WARNING: `state_dicts` generated with `ffn_type: te_ln_mlp` will NOT directly map to `state_dicts` generated using the default network configurations. We do not have control over how `te.LayerNormMLP` is implemented and therefore cannot readily reconcile it with the default implementation (or any other implementation).
### How expensive is it to build LLMs?
- Check out our blog post [GPT3-Quality for <$500k](https://www.mosaicml.com/blog/gpt-3-quality-for-500k) for guidance on LLM training times and costs.
diff --git a/llmfoundry/__init__.py b/llmfoundry/__init__.py
index 3bb9eed043..51fa67993a 100644
--- a/llmfoundry/__init__.py
+++ b/llmfoundry/__init__.py
@@ -4,6 +4,11 @@
import torch
try:
+    # Before importing any transformers models, we need to disable transformers flash attention if
+    # we are in an environment with flash attention version <2, because transformers otherwise
+    # hard-errors on an import that is not properly gated.
+ import transformers
+
from llmfoundry import optim, utils
from llmfoundry.data import (ConcatTokensDataset,
MixtureOfDenoisersCollator, NoConcatDataset,
@@ -14,8 +19,8 @@
ComposerHFT5)
from llmfoundry.models.layers.attention import (
MultiheadAttention, attn_bias_shape, build_alibi_bias, build_attn_bias,
- flash_attn_fn, scaled_multihead_dot_product_attention,
- triton_flash_attn_fn)
+ flash_attn_fn, is_flash_v1_installed,
+ scaled_multihead_dot_product_attention, triton_flash_attn_fn)
from llmfoundry.models.layers.blocks import MPTBlock
from llmfoundry.models.layers.ffn import (FFN_CLASS_REGISTRY, MPTMLP,
build_ffn)
@@ -24,6 +29,8 @@
MPTForCausalLM, MPTModel,
MPTPreTrainedModel)
from llmfoundry.tokenizers import TiktokenTokenizerWrapper
+ if is_flash_v1_installed():
+ transformers.utils.is_flash_attn_available = lambda: False
except ImportError as e:
try:
diff --git a/llmfoundry/callbacks/eval_gauntlet_callback.py b/llmfoundry/callbacks/eval_gauntlet_callback.py
index 78ccbb529b..7281a8d1fc 100644
--- a/llmfoundry/callbacks/eval_gauntlet_callback.py
+++ b/llmfoundry/callbacks/eval_gauntlet_callback.py
@@ -22,6 +22,32 @@ class Weighting(Enum):
LOG_SAMPLE_SZ = 3
+def calculate_named_averages(average_names: Dict[str, list],
+ category_scores: Dict[str, float]):
+    """Calculates the named averages based on the raw category scores.
+
+ For each named average, take a simple average of all the category scores associated with that named average.
+
+ Args:
+        average_names (dict[str, list]): Maps each named average to the list of categories whose scores are averaged to produce it.
+ category_scores (dict[str, float]): Contains the raw scores corresponding to each category.
+ """
+ average_scores = {}
+ for avg_name, category_list in average_names.items():
+ composite_subset = {
+ category: score
+ for category, score in category_scores.items()
+ if category in category_list
+ }
+ if len(composite_subset.values()) > 0:
+ average_scores[avg_name] = sum(composite_subset.values()) / len(
+ composite_subset.values())
+ else:
+ average_scores[avg_name] = 0
+
+ return average_scores
+
+
class EvalGauntlet(Callback):
"""The EvalGauntlet aggregates ICL eval results.
@@ -31,7 +57,7 @@ class EvalGauntlet(Callback):
Args:
logger_keys (list): These are the exact keys that the individual benchmark metrics will be
logged under in the logger after eval
- tasks (dict): This contains the list of categories, as well as the subtasks within them, the
+ categories (dict): This contains the list of categories, as well as the subtasks within them, the
random baseline accuracy of each subtask, and the number of fewshot examples
used for the task. See `llmfoundry/scripts/eval/yamls/eval_gauntlet.yaml` to see the structure.
weighting (Weighting): The weighting scheme used to balance different tasks within each category.
@@ -43,6 +69,7 @@ class EvalGauntlet(Callback):
rescale_accuracy (bool): Flag determining whether to rescale the accuracy on each benchmark
by (1-random_baseline_accuracy) before aggregating. Using this ensures that all benchmarks max out at 1.0.
benchmark_sizes (Optional[dict]): Optional data on benchmark sizes, used when not relying on equal weighting.
+        averages (Optional[dict]): Optional dictionary specifying a mapping from average names to the lists of categories used to produce each named average.
"""
def __init__(self,
@@ -51,7 +78,8 @@ def __init__(self,
weighting: str = 'EQUAL',
subtract_random_baseline: bool = True,
rescale_accuracy: bool = True,
- benchmark_sizes: Optional[dict] = None):
+ benchmark_sizes: Optional[dict] = None,
+ averages: Optional[dict] = None):
if isinstance(logger_keys, dict):
raise ValueError(
'logger_keys now requires a list type as input, not a dict')
@@ -66,13 +94,12 @@ def __init__(self,
)
self.categories = categories
+ self.category_names = [conf.get('name') for conf in self.categories]
self.weighting = Weighting[weighting]
self.subtract_random_baseline = subtract_random_baseline
self.rescale_accuracy = rescale_accuracy
self.logger_keys = logger_keys
-
for category in self.categories:
-
for benchmark in category['benchmarks']:
bench_name = f"{benchmark['name']}/{benchmark['num_fewshot']}-shot"
@@ -95,7 +122,20 @@ def __init__(self,
assert weight is not None
benchmark['weighting'] = weight
- def compute_averages(self, state: State) -> Dict[str, float]:
+ self.averages = {}
+ if averages is not None:
+ self.averages = averages
+ else:
+ # if no averages spec provided, simply average everything
+ self.averages['default_average'] = self.category_names
+
+ for avg_name in self.averages:
+ if avg_name in self.category_names:
+ raise ValueError(
+ f'Found average name `{avg_name}` used as category name. Average names and category names must be non-overlapping.'
+ )
+
+ def extract_metrics_from_state(self, state: State) -> Dict[str, float]:
results = {}
for key in self.logger_keys:
@@ -121,23 +161,22 @@ def compute_averages(self, state: State) -> Dict[str, float]:
return {k: sum(v) / len(v) for k, v in results.items()}
def eval_after_all(self, state: State, logger: Logger) -> Dict[str, float]:
- new_metrics = self.compute_averages(state)
- if len(new_metrics) == 0:
+ computed_metrics = self.extract_metrics_from_state(state)
+ if len(computed_metrics) == 0:
return {}
- composite_scores = {}
-
+ category_scores = {}
for category in self.categories:
missing_metrics = []
- composite_scores[category['name']] = []
+ category_scores[category['name']] = []
for benchmark in category['benchmarks']:
key = f"{benchmark['name']}/{benchmark['num_fewshot']}-shot"
- if key not in new_metrics:
+ if key not in computed_metrics:
log.warning(
f'Could not find results for benchmark: {benchmark}.')
missing_metrics.append(key)
else:
- score = new_metrics[key]
+ score = computed_metrics[key]
if self.subtract_random_baseline:
score -= benchmark['random_baseline']
@@ -145,7 +184,7 @@ def eval_after_all(self, state: State, logger: Logger) -> Dict[str, float]:
if self.rescale_accuracy and self.subtract_random_baseline:
score /= 1.0 - benchmark['random_baseline']
- composite_scores[category['name']].append({
+ category_scores[category['name']].append({
'name': benchmark['name'],
'score': score,
'weighting': benchmark['weighting']
@@ -155,23 +194,22 @@ def eval_after_all(self, state: State, logger: Logger) -> Dict[str, float]:
log.warning(
f"Removing category `{category['name']}` from scores because benchmarks were missing: {missing_metrics}"
)
- del composite_scores[category['name']]
+ del category_scores[category['name']]
continue
total_weight = sum(
- k['weighting'] for k in composite_scores[category['name']])
- composite_scores[category['name']] = sum(
+ k['weighting'] for k in category_scores[category['name']])
+ category_scores[category['name']] = sum(
k['score'] * (k['weighting'] / total_weight)
- for k in composite_scores[category['name']])
+ for k in category_scores[category['name']])
- composite_scores = {
+ named_averages = calculate_named_averages(self.averages,
+ category_scores)
+ category_scores.update(named_averages)
+ category_scores = {
f'icl/metrics/eval_gauntlet/{k}': v
- for k, v in composite_scores.items()
+ for k, v in category_scores.items()
}
-
- composite_scores['icl/metrics/eval_gauntlet/average'] = sum(
- composite_scores.values()) / len(composite_scores.values()) if len(
- composite_scores.values()) > 0 else 0
if logger is not None:
- logger.log_metrics(composite_scores)
+ logger.log_metrics(category_scores)
- return composite_scores
+ return category_scores
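For reference, here is a minimal standalone sketch of what the new named-average logic computes; the category names and scores below are invented for illustration.

```python
# Illustration of calculate_named_averages: each named average is a simple
# (unweighted) mean of the category scores listed under it. Values are hypothetical.
category_scores = {'world_knowledge': 0.45, 'commonsense_reasoning': 0.50, 'safety': 0.80}
averages = {'core_average': ['world_knowledge', 'commonsense_reasoning']}

average_scores = {}
for avg_name, category_list in averages.items():
    subset = [score for category, score in category_scores.items() if category in category_list]
    average_scores[avg_name] = sum(subset) / len(subset) if subset else 0

print(average_scores)  # {'core_average': 0.475}
```

Each named average is then logged alongside the per-category scores under the `icl/metrics/eval_gauntlet/` prefix.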
diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py
index aa3beda513..c79537c781 100644
--- a/llmfoundry/callbacks/hf_checkpointer.py
+++ b/llmfoundry/callbacks/hf_checkpointer.py
@@ -4,18 +4,20 @@
import contextlib
import copy
import logging
+import math
import os
import tempfile
from pathlib import Path
from typing import Optional, Union
import torch
-from composer.core import Callback, Event, State, Time
+from composer.core import Callback, Event, State, Time, TimeUnit
from composer.core.state import fsdp_state_dict_type_context
from composer.loggers import Logger, MLFlowLogger
-from composer.loggers.remote_uploader_downloader import RemoteUploaderDownloader
from composer.models import HuggingFaceModel
-from composer.utils import dist, format_name_with_dist_and_time, parse_uri
+from composer.utils import (dist, format_name_with_dist_and_time,
+ maybe_create_remote_uploader_downloader_from_uri,
+ parse_uri)
from composer.utils.misc import create_interval_scheduler
from transformers import PreTrainedModel, PreTrainedTokenizerBase
@@ -52,12 +54,11 @@ def __init__(
save_interval: Union[str, int, Time],
huggingface_folder_name: str = 'ba{batch}',
precision: str = 'float32',
- overwrite: bool = False,
+ overwrite: bool = True,
mlflow_registered_model_name: Optional[str] = None,
mlflow_logging_config: Optional[dict] = None,
):
- self.backend, self.bucket_name, self.save_dir_format_str = parse_uri(
- save_folder)
+ _, _, self.save_dir_format_str = parse_uri(save_folder)
self.overwrite = overwrite
self.precision = precision
self.dtype = {
@@ -73,25 +74,30 @@ def __init__(
if self.mlflow_registered_model_name is not None:
# Both the metadata and the task are needed in order for mlflow
# and databricks optimized model serving to work
- if 'metadata' not in mlflow_logging_config:
- mlflow_logging_config['metadata'] = {
- 'task': 'llm/v1/completions'
- }
- if 'task' not in mlflow_logging_config:
- mlflow_logging_config['task'] = 'text-generation'
+ default_metadata = {'task': 'llm/v1/completions'}
+ passed_metadata = mlflow_logging_config.get('metadata', {})
+ mlflow_logging_config['metadata'] = {
+ **default_metadata,
+ **passed_metadata
+ }
+ mlflow_logging_config.setdefault('task', 'text-generation')
self.mlflow_logging_config = mlflow_logging_config
self.huggingface_folder_name_fstr = os.path.join(
'huggingface', huggingface_folder_name)
+
+ if isinstance(save_interval, str):
+ save_interval = Time.from_timestring(save_interval)
+ if isinstance(save_interval, int):
+ save_interval = Time(save_interval, TimeUnit.EPOCH)
+
+ self.save_interval = save_interval
self.check_interval = create_interval_scheduler(
save_interval, include_end_of_training=True)
- self.upload_to_object_store = (self.backend != '')
- if self.upload_to_object_store:
- self.remote_ud = RemoteUploaderDownloader(
- bucket_uri=f'{self.backend}://{self.bucket_name}',
- num_concurrent_uploads=4)
- else:
- self.remote_ud = None
+ self.remote_ud = maybe_create_remote_uploader_downloader_from_uri(
+ save_folder, loggers=[])
+ if self.remote_ud is not None:
+ self.remote_ud._num_concurrent_uploads = 4
self.last_checkpoint_batch: Optional[Time] = None
self.mlflow_loggers = []
@@ -107,7 +113,7 @@ def run_event(self, event: Event, state: State, logger: Logger) -> None:
raise ValueError(
f'`HuggingFaceCheckpointer` is only compatible with `HuggingFaceModel`s. '
+ f'Got {type(state.model)} instead.')
- if self.upload_to_object_store and self.remote_ud is not None:
+ if self.remote_ud is not None:
self.remote_ud.init(state, logger)
state.callbacks.append(self.remote_ud)
@@ -128,6 +134,21 @@ def run_event(self, event: Event, state: State, logger: Logger) -> None:
mlflow.environment_variables.MLFLOW_HUGGINGFACE_MODEL_MAX_SHARD_SIZE.set(
'5GB')
+ def _is_last_batch(self, state: State):
+ elapsed_duration = state.get_elapsed_duration()
+ if elapsed_duration is not None and elapsed_duration >= 1.0:
+ return True
+
+ assert state.max_duration is not None # for pyright
+ # If the save interval is specified as 1dur, and the max duration is in epoch units
+ # we need a special case to identify we are on the last batch and should write the mlflow checkpoint
+ if self.save_interval.unit == TimeUnit.DURATION and self.save_interval.value == 1 and state.max_duration.unit == TimeUnit.EPOCH:
+ assert state.dataloader_len is not None # for pyright
+ return int(state.timestamp.batch) % math.ceil(
+ state.max_duration.value * state.dataloader_len) == 0
+
+ return False
+
def _save_checkpoint(self, state: State, logger: Logger):
del logger # unused
@@ -146,7 +167,7 @@ def _save_checkpoint(self, state: State, logger: Logger):
self.huggingface_folder_name_fstr), state.run_name,
state.timestamp)
dir_context_mgr = tempfile.TemporaryDirectory(
- ) if self.upload_to_object_store else contextlib.nullcontext(
+ ) if self.remote_ud is not None else contextlib.nullcontext(
enter_result=save_dir)
with dir_context_mgr as temp_save_dir:
@@ -183,7 +204,7 @@ def _save_checkpoint(self, state: State, logger: Logger):
state_dict[k] = v.to(dtype=self.dtype)
if dist.get_global_rank() == 0:
- log.debug('Saving Hugging Face checkpoint to disk')
+ log.debug('Saving Hugging Face checkpoint in global rank 0')
copied_config = copy.deepcopy(original_model.config)
if copied_config.model_type == 'mpt':
@@ -210,11 +231,8 @@ def _save_checkpoint(self, state: State, logger: Logger):
log.debug('Editing MPT files for HuggingFace compatibility')
edit_files_for_hf_compatibility(temp_save_dir)
- if self.upload_to_object_store:
- assert self.remote_ud is not None
- log.info(
- f'Uploading HuggingFace formatted checkpoint to {self.backend}://{self.bucket_name}/{save_dir}'
- )
+ if self.remote_ud is not None:
+ log.info(f'Uploading HuggingFace formatted checkpoint')
for filename in os.listdir(temp_save_dir):
self.remote_ud.upload_file(
state=state,
@@ -224,8 +242,8 @@ def _save_checkpoint(self, state: State, logger: Logger):
overwrite=self.overwrite,
)
- elapsed_duration = state.get_elapsed_duration()
- if self.mlflow_registered_model_name is not None and elapsed_duration is not None and elapsed_duration >= 1.0:
+ if self.mlflow_registered_model_name and self._is_last_batch(
+ state):
components = {'model': new_model_instance}
if original_tokenizer is not None:
components['tokenizer'] = original_tokenizer
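To make the metadata handling above concrete, here is a small sketch of the merge semantics (the override value is hypothetical): user-supplied metadata keys win over the defaults, while `task` is only filled in when absent.

```python
# Sketch of the mlflow_logging_config defaulting logic above.
default_metadata = {'task': 'llm/v1/completions'}
mlflow_logging_config = {'metadata': {'task': 'llm/v1/chat'}}  # hypothetical user-provided config

passed_metadata = mlflow_logging_config.get('metadata', {})
mlflow_logging_config['metadata'] = {**default_metadata, **passed_metadata}
mlflow_logging_config.setdefault('task', 'text-generation')

print(mlflow_logging_config)
# {'metadata': {'task': 'llm/v1/chat'}, 'task': 'text-generation'}
```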
diff --git a/llmfoundry/data/__init__.py b/llmfoundry/data/__init__.py
index c997c865dd..8da436b9b1 100644
--- a/llmfoundry/data/__init__.py
+++ b/llmfoundry/data/__init__.py
@@ -2,6 +2,7 @@
# SPDX-License-Identifier: Apache-2.0
from llmfoundry.data.data import ConcatTokensDataset, NoConcatDataset
+from llmfoundry.data.dataloader import build_dataloader
from llmfoundry.data.denoising import (MixtureOfDenoisersCollator,
build_text_denoising_dataloader)
from llmfoundry.data.finetuning import (Seq2SeqFinetuningCollator,
@@ -18,4 +19,5 @@
'build_text_dataloader',
'NoConcatDataset',
'ConcatTokensDataset',
+ 'build_dataloader',
]
diff --git a/llmfoundry/data/dataloader.py b/llmfoundry/data/dataloader.py
new file mode 100644
index 0000000000..12741717be
--- /dev/null
+++ b/llmfoundry/data/dataloader.py
@@ -0,0 +1,44 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
+"""Dataloader builder utilities."""
+
+from composer import DataSpec
+from omegaconf import DictConfig
+from transformers import PreTrainedTokenizerBase
+
+from llmfoundry.data.denoising import build_text_denoising_dataloader
+from llmfoundry.data.finetuning.dataloader import build_finetuning_dataloader
+from llmfoundry.data.text_data import build_text_dataloader
+
+
+def build_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase,
+ device_batch_size: int) -> DataSpec:
+ """Builds a dataloader from a config.
+
+ Args:
+ cfg (DictConfig): An omegaconf dictionary used to configure the loader.
+ tokenizer (PreTrainedTokenizerBase): The tokenizer that the model will use.
+ device_batch_size (int): The size of the batches (number of examples)
+ that the dataloader will produce.
+ """
+ if cfg.name == 'text':
+ return build_text_dataloader(
+ cfg,
+ tokenizer,
+ device_batch_size,
+ )
+ elif cfg.name == 'text_denoising':
+ return build_text_denoising_dataloader(
+ cfg,
+ tokenizer,
+ device_batch_size,
+ )
+ elif cfg.name == 'finetuning':
+ return build_finetuning_dataloader(
+ cfg,
+ tokenizer,
+ device_batch_size,
+ )
+ else:
+ raise ValueError(f'Not sure how to build dataloader with config: {cfg}')
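As a usage sketch (not taken from the repo), a `text` dataloader could be built roughly as follows; the dataset path, tokenizer, and batch size are illustrative, and the exact required config keys are defined by `build_text_dataloader`.

```python
# Hypothetical usage of the new build_dataloader entry point.
from omegaconf import OmegaConf
from transformers import AutoTokenizer

from llmfoundry.data import build_dataloader

cfg = OmegaConf.create({
    'name': 'text',                  # routes to build_text_dataloader
    'dataset': {
        'local': '/tmp/my-copy-c4',  # illustrative local MDS dataset path
        'split': 'train',
        'max_seq_len': 2048,
        'shuffle': True,
    },
    'drop_last': True,
    'num_workers': 8,
})

tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b')
dataspec = build_dataloader(cfg, tokenizer, device_batch_size=8)
```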
diff --git a/llmfoundry/data/denoising.py b/llmfoundry/data/denoising.py
index bc41945076..8ccf7f25e9 100644
--- a/llmfoundry/data/denoising.py
+++ b/llmfoundry/data/denoising.py
@@ -16,7 +16,7 @@
from torch.utils.data import DataLoader
from transformers import PreTrainedTokenizerBase
-from llmfoundry.data.packing import BinPackWrapper
+from llmfoundry.data.packing import BinPackCollator
from llmfoundry.data.text_data import (StreamingTextDataset,
get_tokens_per_batch_func)
from llmfoundry.models import utils
@@ -375,19 +375,25 @@ def build_text_denoising_dataloader(
cfg.dataset.max_seq_len (int): The maximum length of sequences
in the batch. See :class:`MixtureOfDenoisersCollator` docstring
for details.
- cfg.dataset.packing_ratio (float, optional): If provided, this invokes
+ cfg.dataset.packing_ratio (Optional[float, Literal['auto']]): If provided, this invokes
a collator wrapper that packs device_batch_size*packing_ratio
raw examples into device_batch_size packed examples. This helps
minimize padding while preserving sequence integrity.
This adds `sequence_id` to the batch, which indicates which unique
sequence each token belongs to.
+
+ If set to 'auto', packing_ratio is profiled and the highest observed packing ratio with
+ zero waste is selected.
+ In practice, this may result in > 0 waste because profiling is done on only a portion
+ of the dataset.
+
Note: Using this feature will not change device_batch_size but it
will determine the number of raw examples consumed by the dataloader
per batch. Some examples may be discarded if they do not fit when
packing.
Select packing_ratio **carefully** based on the dataset
statistics, max_seq_len, and tolerance for discarding samples!
- The packing code in `./packing.py` provides a script that can help
+ The script `scripts/misc/profile_packing.py` can help
you choose the best packing_ratio.
See :class:`StreamingTextDataset` for info on other standard config
options within `cfg.dataset`.
@@ -419,7 +425,7 @@ def build_text_denoising_dataloader(
that the dataloader will produce.
Note:
- You can run the script inside `./packing.py` to quickly test the
+ You can use the script `scripts/misc/profile_packing.py` to quickly test the
padding/waste rates for different `cfg.dataset.packing_ratio` choices,
given a starting workload YAML.
"""
@@ -471,13 +477,13 @@ def build_text_denoising_dataloader(
remote=cfg.dataset.get('remote'),
split=cfg.dataset.get('split'),
shuffle=cfg.dataset.get('shuffle', False),
- predownload=cfg.dataset.get('predownload', 100_000),
+ predownload=cfg.dataset.get('predownload', None),
keep_zip=cfg.dataset.get('keep_zip', False),
download_retry=cfg.dataset.get('download_retry', 2),
download_timeout=cfg.dataset.get('download_timeout', 60),
- validate_hash=cfg.dataset.get('validate_hash'),
+ validate_hash=cfg.dataset.get('validate_hash', None),
shuffle_seed=cfg.dataset.get('shuffle_seed', 9176),
- num_canonical_nodes=cfg.dataset.get('num_canonical_nodes', 128),
+ num_canonical_nodes=cfg.dataset.get('num_canonical_nodes', None),
batch_size=device_batch_size,
)
@@ -492,7 +498,7 @@ def build_text_denoising_dataloader(
raise NotImplementedError(
'On-the-fly packing is currently only supported for decoder-only formats.'
)
- collate_fn = BinPackWrapper(
+ collate_fn = BinPackCollator(
collator=collate_fn,
target_batch_size=device_batch_size,
max_seq_len=cfg.dataset.max_seq_len,
diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py
index 2dde563ac6..b19cab841f 100644
--- a/llmfoundry/data/finetuning/dataloader.py
+++ b/llmfoundry/data/finetuning/dataloader.py
@@ -14,7 +14,7 @@
from llmfoundry.data.finetuning.collator import Seq2SeqFinetuningCollator
from llmfoundry.data.finetuning.tasks import dataset_constructor
-from llmfoundry.data.packing import BinPackWrapper
+from llmfoundry.data.packing import BinPackCollator, auto_packing_ratio
from llmfoundry.data.text_data import get_tokens_per_batch_func
log = logging.getLogger(__name__)
@@ -74,20 +74,26 @@ def build_finetuning_dataloader(cfg: DictConfig,
cfg.dataset.allow_pad_trimming (bool, optional): Whether to allow
the collator to trim padding. See :class:`Seq2SeqFinetuningCollator`
docstring for details. Default: ``False``.
- cfg.dataset.packing_ratio (float, optional): If provided, this invokes
- a collator wrapper that packs `device_batch_size*packing_ratio`
- raw examples into `device_batch_size` packed examples. This helps
+ cfg.dataset.packing_ratio (Optional[float, Literal['auto']]): If provided, this invokes
+ a collator wrapper that packs device_batch_size*packing_ratio
+ raw examples into device_batch_size packed examples. This helps
minimize padding while preserving sequence integrity.
This adds `sequence_id` to the batch, which indicates which unique
sequence each token belongs to.
+
+ If set to 'auto', packing_ratio is profiled and the highest observed packing ratio with
+ zero waste is selected.
+ In practice, this may result in > 0 waste because profiling is done on only a portion
+ of the dataset.
+
Note: Using this feature will not change device_batch_size but it
will determine the number of raw examples consumed by the dataloader
per batch. Some examples may be discarded if they do not fit when
packing.
- Select `packing_ratio` **carefully** based on the dataset
- statistics, `max_seq_len`, and tolerance for discarding samples!
- The packing code in `../packing.py` provides a script that can help
- you choose the best `packing_ratio`.
+ Select packing_ratio **carefully** based on the dataset
+ statistics, max_seq_len, and tolerance for discarding samples!
+ The script `scripts/misc/profile_packing.py` can help
+ you choose the best packing_ratio.
cfg.dataset.shuffle (bool): Whether to shuffle the dataset.
___
See :class:`StreamingFinetuningDataset` for info on other standard config
@@ -106,7 +112,7 @@ def build_finetuning_dataloader(cfg: DictConfig,
A pytorch dataloader
Note:
- You can run the script inside `../packing.py` to quickly test the
+        You can use the script `scripts/misc/profile_packing.py` to quickly test the
padding/waste rates for different `cfg.dataset.packing_ratio` choices,
given a starting workload YAML.
"""
@@ -130,20 +136,20 @@ def build_finetuning_dataloader(cfg: DictConfig,
epoch_size=cfg.dataset.get('epoch_size', None),
predownload=cfg.dataset.get('predownload', None),
cache_limit=cfg.dataset.get('cache_limit', None),
- partition_algo=cfg.dataset.get('partition_algo', 'orig'),
+ partition_algo=cfg.dataset.get('partition_algo', 'relaxed'),
num_canonical_nodes=cfg.dataset.get('num_canonical_nodes', None),
batch_size=device_batch_size,
shuffle=cfg.dataset.get('shuffle', False),
- shuffle_algo=cfg.dataset.get('shuffle_algo', 'py1b'),
+ shuffle_algo=cfg.dataset.get('shuffle_algo', 'py1e'),
shuffle_seed=cfg.dataset.get('shuffle_seed', 9176),
- shuffle_block_size=cfg.dataset.get('shuffle_block_size', 1 << 18),
+ shuffle_block_size=cfg.dataset.get('shuffle_block_size', None),
sampling_method=cfg.dataset.get('sampling_method', 'balanced'),
sampling_granularity=cfg.dataset.get('sampling_granularity', 1),
batching_method=cfg.dataset.get('batching_method', 'random'),
)
collate_fn, dataloader_batch_size = _build_collate_fn(
- cfg.dataset, tokenizer, device_batch_size)
+ cfg, tokenizer, device_batch_size)
dl = DataLoader(
dataset,
@@ -174,7 +180,7 @@ def build_finetuning_dataloader(cfg: DictConfig,
)
collate_fn, dataloader_batch_size = _build_collate_fn(
- cfg.dataset, tokenizer, device_batch_size)
+ cfg, tokenizer, device_batch_size)
if cfg.drop_last:
world_size = dist.get_world_size()
@@ -367,25 +373,40 @@ def _build_hf_dataset_from_remote(
def _build_collate_fn(
- dataset_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase,
+ dataloader_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase,
device_batch_size: int
-) -> Tuple[Union[Seq2SeqFinetuningCollator, BinPackWrapper], int]:
+) -> Tuple[Union[Seq2SeqFinetuningCollator, BinPackCollator], int]:
+ dataset_cfg = dataloader_cfg.dataset
+ max_seq_len = dataset_cfg.max_seq_len
+
collate_fn = Seq2SeqFinetuningCollator(
tokenizer=tokenizer,
- max_seq_len=dataset_cfg.max_seq_len,
+ max_seq_len=max_seq_len,
decoder_only_format=dataset_cfg.decoder_only_format,
allow_pad_trimming=dataset_cfg.get('allow_pad_trimming', False),
)
packing_ratio = dataset_cfg.get('packing_ratio')
+ max_leftover_bins_to_keep = dataset_cfg.get('max_leftover_bins_to_keep')
if packing_ratio is None:
- if dataset_cfg.get('max_leftover_bins_to_keep') is not None:
+ if max_leftover_bins_to_keep is not None:
raise ValueError(
'dataset.max_leftover_bins_to_keep has been defined, ' +\
'but dataset.packing_ratio has not been set. Please set ' +\
'the latter to turn on packing or remove the former from the config.')
return collate_fn, device_batch_size
+ if packing_ratio == 'auto':
+ packing_ratio = auto_packing_ratio(dataloader_cfg, tokenizer,
+ device_batch_size)
+
+ if isinstance(packing_ratio, str):
+ raise ValueError(
+ 'dataset.packing_ratio must be a float or "auto", but it was set to '
+ + f'{packing_ratio}.')
+
+ log.info(f'Using packing ratio {packing_ratio}')
+
if packing_ratio == 1.0:
return collate_fn, device_batch_size
elif packing_ratio < 1.0:
@@ -396,13 +417,13 @@ def _build_collate_fn(
'On-the-fly packing is currently only supported for decoder-only formats.'
)
- collate_fn = BinPackWrapper(
+ collate_fn = BinPackCollator(
collator=collate_fn,
target_batch_size=device_batch_size,
- max_seq_len=dataset_cfg.max_seq_len,
+ max_seq_len=max_seq_len,
pad_token_id=tokenizer.pad_token_id,
padding_side=tokenizer.padding_side,
- max_leftover_bins_to_keep=dataset_cfg.get('max_leftover_bins_to_keep'),
+ max_leftover_bins_to_keep=max_leftover_bins_to_keep,
)
n_examples_to_pack = int(device_batch_size * packing_ratio)
return collate_fn, n_examples_to_pack
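To illustrate the new `'auto'` option, a finetuning dataloader config might look roughly like the following; the dataset name and other values are placeholders, and the full option set is documented in the `build_finetuning_dataloader` docstring above.

```python
# Hypothetical finetuning dataloader config using automatic packing-ratio profiling.
from omegaconf import OmegaConf

dataloader_cfg = OmegaConf.create({
    'name': 'finetuning',
    'dataset': {
        'hf_name': 'my-org/my-instruct-dataset',  # placeholder dataset name
        'split': 'train',
        'max_seq_len': 2048,
        'decoder_only_format': True,
        'packing_ratio': 'auto',  # profiled at startup; highest ratio with zero observed waste
        'shuffle': True,
    },
    'drop_last': True,
    'num_workers': 8,
})
```

A resolved `packing_ratio` of `1.0` skips the packing wrapper entirely, and any other float routes through `BinPackCollator` as before.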
diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py
index f2bd0239c8..bc712a7504 100644
--- a/llmfoundry/data/finetuning/tasks.py
+++ b/llmfoundry/data/finetuning/tasks.py
@@ -38,6 +38,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:
from typing import Any, Callable, Dict, List, Optional, Union
import datasets as hf_datasets
+from composer.utils import dist
from omegaconf import DictConfig
from streaming import StreamingDataset
from transformers import PreTrainedTokenizerBase
@@ -87,12 +88,12 @@ class StreamingFinetuningDataset(StreamingDataset):
keep_zip (bool): Whether to keep or delete the compressed form when decompressing
downloaded shards. If ``False``, keep iff remote is local or no remote. Defaults to
`False``.
- epoch_size (int, optional): Number of samples to draw per epoch balanced across all
+ epoch_size (Union[int, str], optional): Number of samples to draw per epoch balanced across all
streams. If ``None``, takes its value from the total number of underlying samples.
Provide this field if you are weighting streams relatively to target a larger or
smaller epoch size. Defaults to ``None``.
predownload (int, optional): Target number of samples ahead to download the shards of while
- iterating. Defaults to ``100_000``.
+ iterating. If ``None``, its value is set to ``8 * batch_size``. Defaults to ``None``.
cache_limit (Union[int, str], optional) - Maximum size in bytes of this StreamingDataset's
shard cache. Before downloading a shard, the least recently used resident shard(s) may
be evicted (deleted from the local cache) in order to stay under the limit. Set to None
@@ -100,15 +101,17 @@ class StreamingFinetuningDataset(StreamingDataset):
bytes (e.g., 100b, 64kb, 77mb, and so on). Defaults to None.
partition_algo (str): Which partitioning algorithm to use. Defaults to ``orig``.
num_canonical_nodes (int, optional): Canonical number of nodes for shuffling with
- resumption. Defaults to ``None``, which is interpreted as the number of nodes of the
- initial run.
+ resumption. If ``None``, this is interpreted as 64 times the number of physical
+ nodes of the initial run if ``shuffle_algo`` is ``py1s`` or ``py2s``, and simply the
+ number of physical nodes of the initial run otherwise. Defaults to ``None``.
batch_size (int, optional): Batch size of its DataLoader, which affects how the dataset is
partitioned over the workers. Defaults to ``None``.
shuffle (bool): Whether to iterate over the samples in randomized order. Defaults to
``False``.
- shuffle_algo (str): Which shuffling algorithm to use. Defaults to ``py1b``.
+ shuffle_algo (str): Which shuffling algorithm to use. Defaults to ``py1e``.
shuffle_seed (int): Seed for Deterministic data shuffling. Defaults to ``9176``.
- shuffle_block_size (int): Unit of shuffle. Defaults to ``1 << 18``.
+ shuffle_block_size (int): Unit of shuffle. If ``None``, its value is calculated as
+            ``max(4_000_000 // num_canonical_nodes, 1 << 18)``. Defaults to ``None``.
sampling_method (str): Which sampling method to use, either ``balanced`` or ``fixed``.
Defaults to ``balanced``.
sampling_granularity (int): When picking samples for a stream's final partial repeat,
@@ -128,16 +131,16 @@ def __init__(self,
download_timeout: float = 60,
validate_hash: Optional[str] = None,
keep_zip: bool = False,
- epoch_size: Optional[int] = None,
+ epoch_size: Optional[Union[int, str]] = None,
predownload: Optional[int] = None,
cache_limit: Optional[Union[int, str]] = None,
- partition_algo: str = 'orig',
+ partition_algo: str = 'relaxed',
num_canonical_nodes: Optional[int] = None,
batch_size: Optional[int] = None,
shuffle: bool = False,
- shuffle_algo: str = 'py1b',
+ shuffle_algo: str = 'py1e',
shuffle_seed: int = 9176,
- shuffle_block_size: int = 1 << 18,
+ shuffle_block_size: Optional[int] = None,
sampling_method: str = 'balanced',
sampling_granularity: int = 1,
batching_method: str = 'random',
@@ -332,6 +335,16 @@ def build_from_hf(
preprocessing_fn = self.get_preprocessing_fn_from_str(
proto_preprocessing_fn, dataset_name)
+ signal_file_path = f'.node_{dist.get_node_rank()}_local_rank0_data_prep_completed'
+
+ # Non local rank 0 ranks will wait here for local rank 0 to finish the data processing.
+ # Once local rank 0 is done, the datasets are all cached on disk, and all other ranks
+ # can just read them.
+ if dist.get_local_rank() != 0:
+ log.debug('Waiting for local_rank 0 to finish data prep')
+ with dist.local_rank_zero_download_and_wait(signal_file_path):
+ pass
+
dataset = hf_datasets.load_dataset(dataset_name, split=split, **kwargs)
def dataset_mapper(example: Dict):
@@ -339,34 +352,59 @@ def dataset_mapper(example: Dict):
example = preprocessing_fn(example)
return _tokenize_formatted_example(example, tokenizer)
+ detected_cpu_count = os.cpu_count() or 1
+ detected_cpus_with_margin = detected_cpu_count - 8
+ num_cpus_to_use = max(1, detected_cpus_with_margin)
+
columns_to_remove = list(dataset[0].keys())
tokenized_dataset = dataset.map(
dataset_mapper,
batched=False,
remove_columns=columns_to_remove,
+ num_proc=num_cpus_to_use,
+ desc='Tokenizing dataset',
+ )
+
+ pad_token_id = tokenizer.pad_token_id
+
+ def filter_long_or_empty_examples(example: Dict) -> bool:
+ less_than_max_seq_len = len(example['input_ids']) < max_seq_len
+ non_empty_input = len(example['input_ids']) > 0
+ non_empty_labels = len(example['labels']) > 0
+ non_padding_response = any(
+ token_id != pad_token_id for token_id in example['labels'])
+ return (less_than_max_seq_len and non_empty_input and
+ non_empty_labels and non_padding_response)
+
+ filtered_dataset = tokenized_dataset.filter(
+ filter_long_or_empty_examples,
+ num_proc=num_cpus_to_use,
+            desc='Filtering out long or empty examples',
)
- prompt_length_filtered_dataset = tokenized_dataset.filter(
- lambda example: len(example['input_ids']) < max_seq_len)
- examples_removed = len(tokenized_dataset) - len(
- prompt_length_filtered_dataset)
+ examples_removed = len(tokenized_dataset) - len(filtered_dataset)
if examples_removed > 0:
warnings.warn(
- f'Dropped {examples_removed} examples where the prompt was longer than {max_seq_len}.'
+ f'Dropped {examples_removed} examples where the prompt was longer than {max_seq_len}, '
+ +
+ 'the prompt or response was empty, or the response was all padding tokens.'
)
- empty_examples_dropped_dataset = prompt_length_filtered_dataset.filter(
- lambda example: len(example['input_ids']) > 0 and len(example[
- 'labels']) > 0 and any(token_id != tokenizer.pad_token_id
- for token_id in example['labels']))
- empty_examples_removed = len(prompt_length_filtered_dataset) - len(
- empty_examples_dropped_dataset)
- if empty_examples_removed > 0:
- warnings.warn(
- f'Dropped {empty_examples_removed} examples where the prompt or response was empty, '
- + 'or the response was only padding tokens.')
+ # Now local rank 0 indicates to the other ranks that it is done
+ if dist.get_local_rank() == 0:
+ log.debug('Local rank 0 finished data prep')
+ with open(signal_file_path, 'wb') as f:
+ f.write(b'local_rank0_completed_data_prep')
+
+ # All ranks sync up at this barrier, having completed data processing
+ dist.barrier()
+
+ # Last, local rank 0 cleans up the signal file
+ if dist.get_local_rank() == 0:
+ os.remove(signal_file_path)
- return empty_examples_dropped_dataset
+ log.debug('All ranks finished data prep')
+ return filtered_dataset
def build_from_streaming(self, *args: Any,
**kwargs: Any) -> StreamingFinetuningDataset:
diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py
index 1532de276e..45322c9b2f 100644
--- a/llmfoundry/data/packing.py
+++ b/llmfoundry/data/packing.py
@@ -1,8 +1,7 @@
# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0
-import os
-from typing import Any, Callable, Dict, List, Literal, Optional, Tuple
+from typing import Callable, Dict, Iterable, List, Literal, Optional, Tuple
import numpy as np
import torch
@@ -10,7 +9,7 @@
from transformers import PreTrainedTokenizerBase
-class BinPackWrapper:
+class BinPackCollator:
"""Utility collator for packing to reduce padding."""
def __init__(self,
@@ -33,13 +32,10 @@ def __init__(self,
if self.pad_token_id < 0:
raise ValueError(f'{pad_token_id=} must be >=0.')
- if max_leftover_bins_to_keep is None:
- self.max_leftover_bins_to_keep = int(10 * self.out_size)
- elif max_leftover_bins_to_keep < 0:
+ if max_leftover_bins_to_keep is not None and max_leftover_bins_to_keep < 0:
raise ValueError(
f'{max_leftover_bins_to_keep=} must be >=0 or None.')
- else:
- self.max_leftover_bins_to_keep = int(max_leftover_bins_to_keep)
+ self.max_leftover_bins_to_keep = max_leftover_bins_to_keep
self.n_packed_tokens = 0
self.n_total_tokens = 0
@@ -60,7 +56,9 @@ def __call__(
self,
examples: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]:
batch = self.base_collator(examples)
+ return self.pack(batch)
+ def pack(self, batch: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
assert 'attention_mask' in batch
assert 'input_ids' in batch
@@ -75,12 +73,12 @@ def __call__(
# Cut everything down to size
sizes, trimmed_examples = [], []
for idx in range(batch['attention_mask'].shape[0]):
- size, trimmed_example = extract_trim_batch_idx(batch, idx)
+ size, trimmed_example = _extract_trim_batch_idx(batch, idx)
sizes.append(size)
trimmed_examples.append(trimmed_example)
# Apply our CS 101 bin packing algorithm.
- packed_examples, n_packed_tokens, n_total_tokens, leftover_bins = first_fit_bin_packing(
+ packed_examples, n_packed_tokens, n_total_tokens, leftover_bins = _first_fit_bin_packing(
sizes=sizes,
examples=trimmed_examples,
num_bins=self.out_size,
@@ -93,15 +91,15 @@ def __call__(
self._leftover_bins = leftover_bins[:self.max_leftover_bins_to_keep]
# Re-pad to max_seq_len and batch
- batch = repad(packed_examples,
- max_seq_len=self.max_seq_len,
- pad_token_id=self.pad_token_id,
- padding_side=self.padding_side)
+ batch = _repad(packed_examples,
+ max_seq_len=self.max_seq_len,
+ pad_token_id=self.pad_token_id,
+ padding_side=self.padding_side)
return batch
-def extract_trim_batch_idx(batch: Dict[str, torch.Tensor],
- idx: int) -> Tuple[int, Dict[str, torch.Tensor]]:
+def _extract_trim_batch_idx(batch: Dict[str, torch.Tensor],
+ idx: int) -> Tuple[int, Dict[str, torch.Tensor]]:
example = {k: v[idx] for k, v in batch.items()}
keep = example['attention_mask'] == 1
@@ -112,7 +110,7 @@ def extract_trim_batch_idx(batch: Dict[str, torch.Tensor],
return size, trim_example
-def combine_in_place(
+def _combine_in_place(
example: Dict[str, torch.Tensor],
add_on: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
if 'labels' in add_on:
@@ -129,7 +127,7 @@ def combine_in_place(
return example
-def first_fit_bin_packing(
+def _first_fit_bin_packing(
sizes: List[int], examples: List[Dict[str, torch.Tensor]], num_bins: int,
max_bin_size: int, existing_bins: List[Tuple[int, Dict[str, torch.Tensor]]]
) -> Tuple[List[Dict[str, torch.Tensor]], int, int, List[Tuple[int, Dict[
@@ -194,7 +192,7 @@ def first_fit_bin_packing(
if bins[bidx][0] + size <= max_bin_size:
bin_size, packed_example = bins.pop(bidx)
bin_size = bin_size + size
- packed_example = combine_in_place(packed_example, example)
+ packed_example = _combine_in_place(packed_example, example)
bins.append((bin_size, packed_example))
added = True
break
@@ -225,8 +223,8 @@ def first_fit_bin_packing(
bin_sizes[:num_bins]), sum(sizes), sorted_bins[num_bins:]
-def repad(packed_examples: List[Dict[str, torch.Tensor]], max_seq_len: int,
- pad_token_id: int, padding_side: str) -> Dict[str, torch.Tensor]:
+def _repad(packed_examples: List[Dict[str, torch.Tensor]], max_seq_len: int,
+ pad_token_id: int, padding_side: str) -> Dict[str, torch.Tensor]:
def pad_tensor(tensor: torch.Tensor, pad_value: int):
if len(tensor) == max_seq_len:
@@ -260,14 +258,169 @@ def pad_tensor(tensor: torch.Tensor, pad_value: int):
return batch
+def auto_packing_ratio(dataloader_cfg: DictConfig,
+ tokenizer: PreTrainedTokenizerBase,
+ device_batch_size: int,
+ num_packing_ratios: int = 20) -> float:
+ """Find a packing ratio that minimizes padding with zero waste.
+
+ By packing examples, we can increase training efficiency, training on more data with fewer batches.
+ However, in practice, the selected packing_ratio may produce some waste because profiling is done on only
+ a subset of the dataset.
+
+ We select a min_ratio of 1 and a max_ratio of max_seq_len / 100, and profile up to
+ num_packing_ratios packing ratios between min_ratio and max_ratio, inclusive.
+ When a packing_ratio with non-zero waste is found, we stop and select the previous ratio,
+ which has zero waste.
+
+ Args:
+ dataloader_cfg (DictConfig): The dataloader configuration for profiling.
+ tokenizer (PreTrainedTokenizerBase): The tokenizer for profiling.
+ device_batch_size (int): The size of the batches (number of examples) per device.
+ num_packing_ratios (int): The number of packing ratios to try.
+
+ Returns:
+ A packing ratio that minimizes padding while maintaining zero waste.
+ """
+ from composer.utils import dist, get_device, reproducibility
+
+ # Stash the rng state to restore later.
+ rng_state = reproducibility.get_rng_state()
+ # Set the seed so that auto packing is deterministic.
+ reproducibility.seed_all(0)
+
+ min_ratio = 1
+ max_ratio = dataloader_cfg.dataset.max_seq_len / 100
+ profiling_results = profile_packing(dataloader_cfg, tokenizer, min_ratio,
+ max_ratio, num_packing_ratios,
+ device_batch_size)
+
+ # Obtain the maximum packing_ratio/minimum padding that has no waste.
+ # profiling_results are sorted from smallest to largest packing_ratio.
+ packing_ratio = 1
+ for packing_ratio_candidate, _, waste in profiling_results:
+ if waste > 0:
+ break
+ packing_ratio = packing_ratio_candidate
+
+ # Select the minimum packing ratio across all ranks.
+ if dist.is_available() and dist.is_initialized():
+ device = get_device(None)
+ packing_ratio_tensor = device.tensor_to_device(
+ torch.tensor(packing_ratio))
+ dist.all_reduce(packing_ratio_tensor, reduce_operation='MIN')
+ packing_ratio = packing_ratio_tensor.item()
+
+ # Restore rng state.
+ reproducibility.load_rng_state(rng_state)
+
+ return packing_ratio
+
+
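For reference, a hedged usage sketch of auto_packing_ratio: only its signature and build_tokenizer come from this repo; the dataset name and finetuning dataset keys below are illustrative stand-ins for a real train_loader config.

from omegaconf import OmegaConf

from llmfoundry.data.packing import auto_packing_ratio
from llmfoundry.utils import build_tokenizer

dataloader_cfg = OmegaConf.create({
    'name': 'finetuning',
    'dataset': {
        'hf_name': 'mosaicml/dolly_hfhub',  # illustrative dataset
        'split': 'train',
        'max_seq_len': 2048,
        'decoder_only_format': True,
        'shuffle': True,
        'packing_ratio': None,  # profiling disables packing internally anyway
    },
    'drop_last': False,
    'num_workers': 8,
})
tokenizer = build_tokenizer('mosaicml/mpt-7b', {'model_max_length': 2048})

# Returns the largest profiled packing_ratio that still produced zero waste.
ratio = auto_packing_ratio(dataloader_cfg, tokenizer, device_batch_size=8)
print(f'auto-selected packing_ratio: {ratio}')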
+def profile_packing(
+ dataloader_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase,
+ min_ratio: float, max_ratio: float, num_packing_ratios: int,
+ device_batch_size: int) -> Iterable[Tuple[float, float, float]]:
+ """Generator function that profiles example packing across packing ratios.
+
+ Args:
+ dataloader_cfg (DictConfig): The dataloader configuration for profiling.
+ tokenizer (PreTrainedTokenizerBase): The tokenizer for profiling.
+ min_ratio (float): Smallest packing_ratio to test. Must be >=1.
+ max_ratio (float): Largest packing_ratio to test. Must be larger than `min_ratio`.
+ num_packing_ratios (int): Number of packing_ratio values (spaced between `min_ratio` and `max_ratio`) to try.
+ device_batch_size (int): The size of the batches (number of examples) per device.
+
+ Returns:
+ An iterable of (packing ratio, padding percentage, waste percentage) tuples, sorted from smallest to largest packing ratio.
+ """
+ import copy
+
+ from llmfoundry.data.dataloader import build_dataloader
+
+ max_seq_len = dataloader_cfg.dataset.get('max_seq_len')
+ max_leftovers_to_keep = dataloader_cfg.dataset.get('max_leftovers_to_keep',
+ None)
+
+ # Turn off packing for the dataloader (we want the raw examples before any packing)
+ dataloader_cfg = copy.deepcopy(dataloader_cfg)
+ dataloader_cfg.dataset.packing_ratio = None
+ dataloader_cfg.drop_last = False
+ dataloader_cfg.num_workers = 0
+ dataloader_cfg.prefetch_factor = None
+ dataloader_cfg.persistent_workers = False
+
+ # Determine the packing_ratio values we'll try
+ packing_ratios, raw_batch_sizes = [], []
+ for packing_ratio in np.linspace(min_ratio,
+ max_ratio,
+ num_packing_ratios,
+ endpoint=True):
+ packing_ratio = np.round(10 * packing_ratio) / 10
+ raw_batch_size = int(packing_ratio * device_batch_size)
+ if raw_batch_size not in raw_batch_sizes:
+ packing_ratios.append(packing_ratio)
+ raw_batch_sizes.append(raw_batch_size)
+
+ n_profile_examples = max(raw_batch_sizes) * 100
+
+ train_dataspec = build_dataloader(dataloader_cfg, tokenizer,
+ n_profile_examples)
+ train_dataloader = train_dataspec.dataloader
+
+ # Get a bunch of raw examples
+ big_batch = next(iter(train_dataloader))
+
+ def split_big_batch(raw_batch_size: int) -> List:
+ input_ids = big_batch['input_ids'].split(raw_batch_size)
+ batches = [{'input_ids': x} for x in input_ids]
+
+ for key in big_batch.keys():
+ if key == 'input_ids':
+ continue
+ for idx, split in enumerate(big_batch[key].split(raw_batch_size)):
+ batches[idx].update({key: split})
+ return batches
+
+ def profile(raw_batch_size: int) -> Tuple[float, float]:
+ packer = BinPackCollator(
+ collator=lambda x: x,
+ target_batch_size=device_batch_size,
+ max_seq_len=max_seq_len,
+ pad_token_id=0, # <-- Doesn't need to be correct for profiling
+ padding_side='left', # <-- Doesn't need to be correct for profiling
+ max_leftover_bins_to_keep=max_leftovers_to_keep)
+
+ # Simulate feeding the packing collator a bunch of data
+ for batch in split_big_batch(raw_batch_size):
+ if batch['input_ids'].shape[0] < device_batch_size:
+ continue
+ _ = packer.pack(batch)
+
+ # Return the padding / waste stats over that bunch of data
+ padding_percent = 100 * (1 - packer.efficiency)
+ waste_percent = 100 * packer.waste
+ return padding_percent, waste_percent
+
+ for packing_ratio, raw_batch_size in zip(packing_ratios, raw_batch_sizes):
+ padding, waste = profile(raw_batch_size)
+ yield (packing_ratio, padding, waste)
+
+
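To make the reported padding and waste numbers concrete, here is a toy run of the renamed BinPackCollator with made-up tensors; the constructor arguments mirror the ones used in profile above.

import torch

from llmfoundry.data.packing import BinPackCollator

packer = BinPackCollator(
    collator=lambda x: x,  # identity collator; we feed an already-collated batch
    target_batch_size=2,
    max_seq_len=6,
    pad_token_id=0,
    padding_side='left',
    max_leftover_bins_to_keep=None)

# Four right-padded examples of lengths 3, 2, 4, and 2 (token values are arbitrary).
batch = {
    'input_ids': torch.tensor([[1, 2, 3, 0, 0, 0],
                               [4, 5, 0, 0, 0, 0],
                               [6, 7, 8, 9, 0, 0],
                               [1, 1, 0, 0, 0, 0]]),
    'attention_mask': torch.tensor([[1, 1, 1, 0, 0, 0],
                                    [1, 1, 0, 0, 0, 0],
                                    [1, 1, 1, 1, 0, 0],
                                    [1, 1, 0, 0, 0, 0]]),
}

packed = packer.pack(batch)  # 4 raw examples packed into target_batch_size=2 rows
print(f'padding: {100 * (1 - packer.efficiency):.1f}%  waste: {100 * packer.waste:.1f}%')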
if __name__ == '__main__':
+
+ import warnings
+
+ warnings.warn(
+ DeprecationWarning(
+ 'Please use scripts/misc/profile_packing.py to profile packing. ' +
+ 'This script will be removed in later releases.'))
+
+ import os
from argparse import ArgumentParser, Namespace
from omegaconf import OmegaConf as om
- from llmfoundry import (build_finetuning_dataloader,
- build_text_denoising_dataloader)
- from llmfoundry.data import build_text_dataloader
from llmfoundry.utils import build_tokenizer
def parse_args() -> Namespace:
@@ -296,7 +449,7 @@ def parse_args() -> Namespace:
parser.add_argument(
'--num-packing-ratios',
type=int,
- default=10,
+ default=20,
help=
'Number of packing_ratio values (spaced between `min` and `max) to try.'
)
@@ -316,20 +469,6 @@ def parse_args() -> Namespace:
raise ValueError('`num_packing_ratios` must be a positive integer.')
return args
- def build_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase,
- device_batch_size: int):
- if cfg.name == 'text':
- return build_text_dataloader(cfg, tokenizer, device_batch_size)
- elif cfg.name == 'text_denoising':
- return build_text_denoising_dataloader(cfg, tokenizer,
- device_batch_size)
- elif cfg.name == 'finetuning':
- return build_finetuning_dataloader(cfg, tokenizer,
- device_batch_size)
- else:
- raise ValueError(
- f'Not sure how to build dataloader with config: {cfg}')
-
args = parse_args()
with open(args.yaml_path) as f:
@@ -339,26 +478,11 @@ def build_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase,
cfg = om.create(cfg)
device_batch_size = cfg.global_train_batch_size // args.num_devices
- # Determine the packing_ratio values we'll try
- packing_ratios, raw_batch_sizes = [], []
- for packing_ratio in np.linspace(args.min,
- args.max,
- args.num_packing_ratios,
- endpoint=True):
- packing_ratio = np.round(10 * packing_ratio) / 10
- raw_batch_size = int(packing_ratio * device_batch_size)
- if raw_batch_size not in raw_batch_sizes:
- packing_ratios.append(packing_ratio)
- raw_batch_sizes.append(raw_batch_size)
-
# Fetch a bunch of raw examples once, which we'll re-use
if 'train_loader' not in cfg:
raise ValueError('config must define train_loader')
dataloader_cfg = cfg.train_loader
- max_leftovers_to_keep = dataloader_cfg.dataset.get('max_leftovers_to_keep',
- None)
-
# build tokenizer
if 'tokenizer' not in cfg:
raise ValueError('config must define tokenizer')
@@ -367,57 +491,19 @@ def build_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase,
if not isinstance(resolved_tokenizer_cfg, Dict):
raise ValueError(
'tokenizer config needs to be resolved by omegaconf into a Dict.')
- tokenizer_cfg: Dict[Any, Any] = resolved_tokenizer_cfg
+ tokenizer_cfg = resolved_tokenizer_cfg
tokenizer_name = tokenizer_cfg['name']
tokenizer_kwargs = tokenizer_cfg.get('kwargs', {})
tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs)
- # Turn off packing for the dataloader (we want raw, pre-packed examples)
- dataloader_cfg.dataset.packing_ratio = None
- dataloader_cfg.dataset.max_leftovers_to_keep = None
- train_dataloader = build_dataloader(dataloader_cfg, tokenizer,
- max(raw_batch_sizes) * 100).dataloader
-
- # Get a bunch of raw examples
- big_batch = next(iter(train_dataloader))
-
- def split_big_batch(raw_batch_size: int) -> List:
- input_ids = big_batch['input_ids'].split(raw_batch_size)
- batches = [{'input_ids': x} for x in input_ids]
-
- for key in big_batch.keys():
- if key == 'input_ids':
- continue
- for idx, split in enumerate(big_batch[key].split(raw_batch_size)):
- batches[idx].update({key: split})
- return batches
-
- def profile_packing(raw_batch_size: int) -> Tuple[float, float]:
- packer = BinPackWrapper(
- collator=lambda x: x,
- target_batch_size=device_batch_size,
- max_seq_len=dataloader_cfg.dataset.max_seq_len,
- pad_token_id=0, # <-- Doesn't need to be correct for profiling
- padding_side='left', # <-- Doesn't need to be correct for profiling
- max_leftover_bins_to_keep=max_leftovers_to_keep)
-
- # Simulate feeding the packing collator a bunch of data
- for batch in split_big_batch(raw_batch_size):
- if batch['input_ids'].shape[0] < device_batch_size:
- continue
- _ = packer(batch)
-
- # Return the padding / waste stats over that bunch of data
- padding_percent = 100 * (1 - packer.efficiency)
- waste_percent = 100 * packer.waste
- return padding_percent, waste_percent
+ results = profile_packing(dataloader_cfg, tokenizer, args.min, args.max,
+ args.num_packing_ratios, device_batch_size)
header = '\n\n\n packing_ratio | % PADDING | % WASTE'
fstr = ' {:5.1f} | {:5.2f}% | {:6.2f}%'
print(header)
print('-' * len(header))
- for packing_ratio, raw_batch_size in zip(packing_ratios, raw_batch_sizes):
- padding, waste = profile_packing(raw_batch_size)
+ for packing_ratio, padding, waste in results:
print(fstr.format(packing_ratio, padding, waste))
diff --git a/llmfoundry/data/text_data.py b/llmfoundry/data/text_data.py
index 93af2f63ed..51fd6b38dc 100644
--- a/llmfoundry/data/text_data.py
+++ b/llmfoundry/data/text_data.py
@@ -46,12 +46,12 @@ class StreamingTextDataset(StreamingDataset):
keep_zip (bool): Whether to keep or delete the compressed form when decompressing
downloaded shards. If ``False``, keep iff remote is local or no remote. Defaults to
`False``.
- epoch_size (int, optional): Number of samples to draw per epoch balanced across all
+ epoch_size (Union[int, str], optional): Number of samples to draw per epoch balanced across all
streams. If ``None``, takes its value from the total number of underlying samples.
Provide this field if you are weighting streams relatively to target a larger or
smaller epoch size. Defaults to ``None``.
predownload (int, optional): Target number of samples ahead to download the shards of while
- iterating. Defaults to ``100_000``.
+ iterating. If ``None``, its value is set to ``8 * batch_size``. Defaults to ``None``.
cache_limit (Union[int, str], optional) - Maximum size in bytes of this StreamingDataset's
shard cache. Before downloading a shard, the least recently used resident shard(s) may
be evicted (deleted from the local cache) in order to stay under the limit. Set to None
@@ -59,15 +59,19 @@ class StreamingTextDataset(StreamingDataset):
bytes (e.g., 100b, 64kb, 77mb, and so on). Defaults to None.
partition_algo (str): Which partitioning algorithm to use. Defaults to ``orig``.
num_canonical_nodes (int, optional): Canonical number of nodes for shuffling with
- resumption. Defaults to ``None``, which is interpreted as the number of nodes of the
- initial run.
+ resumption. If ``None``, this is interpreted as 64 times the number of physical
+ nodes of the initial run if ``shuffle_algo`` is ``py1s`` or ``py2s``, and simply the
+ number of physical nodes of the initial run otherwise. Defaults to ``None``.
batch_size (int, optional): Batch size of its DataLoader, which affects how the dataset is
partitioned over the workers. Defaults to ``None``.
shuffle (bool): Whether to iterate over the samples in randomized order. Defaults to
``False``.
- shuffle_algo (str): Which shuffling algorithm to use. Defaults to ``py1b``.
+ shuffle_algo (str): Which shuffling algorithm to use. Defaults to ``py1e``.
shuffle_seed (int): Seed for Deterministic data shuffling. Defaults to ``9176``.
- shuffle_block_size (int): Unit of shuffle. Defaults to ``1 << 18``.
+ shuffle_block_size (int, optional): Unit of shuffle. A canonical node's samples are split
+ into blocks of this size, and samples within each block are shuffled. If ``None``, its
+ value is calculated as ``max(4_000_000 // num_canonical_nodes, 1 << 18)``. Defaults to
+ ``None``.
sampling_method (str): Which sampling method to use, either ``balanced`` or ``fixed``.
Defaults to ``balanced``.
sampling_granularity (int): When picking samples for a stream's final partial repeat,
@@ -89,16 +93,16 @@ def __init__(self,
download_timeout: float = 60,
validate_hash: Optional[str] = None,
keep_zip: bool = False,
- epoch_size: Optional[int] = None,
- predownload: int = 100_000,
+ epoch_size: Optional[Union[int, str]] = None,
+ predownload: Optional[int] = None,
cache_limit: Optional[Union[int, str]] = None,
- partition_algo: str = 'orig',
+ partition_algo: str = 'relaxed',
num_canonical_nodes: Optional[int] = None,
batch_size: Optional[int] = None,
shuffle: bool = False,
- shuffle_algo: str = 'py1b',
+ shuffle_algo: str = 'py1e',
shuffle_seed: int = 9176,
- shuffle_block_size: int = 1 << 18,
+ shuffle_block_size: Optional[int] = None,
sampling_method: str = 'balanced',
sampling_granularity: int = 1,
batching_method: str = 'random',
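A quick worked illustration of the new ``None`` defaults documented above (the node and batch-size numbers are arbitrary):

num_canonical_nodes = 8
batch_size = 16

shuffle_block_size = max(4_000_000 // num_canonical_nodes, 1 << 18)  # 500_000; the 1 << 18 (262_144) floor only applies above ~15 canonical nodes
predownload = 8 * batch_size  # 128 samples targeted for predownload while iterating

print(shuffle_block_size, predownload)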
diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py
index 13857e9bb9..d52633a09b 100644
--- a/llmfoundry/models/hf/hf_causal_lm.py
+++ b/llmfoundry/models/hf/hf_causal_lm.py
@@ -5,6 +5,7 @@
import logging
import os
+import warnings
from typing import Mapping, Union
# required for loading a python model into composer
@@ -24,8 +25,7 @@
from llmfoundry.models.hf.hf_fsdp import hf_get_init_device
from llmfoundry.models.hf.model_wrapper import HuggingFaceModelWithZLoss
-from llmfoundry.models.layers.llama_attention_monkeypatch import \
- get_llama_attention_patch_fn
+from llmfoundry.models.layers.attention import is_flash_v2_installed
from llmfoundry.models.utils import init_empty_weights
try:
@@ -95,12 +95,28 @@ def __init__(self, om_model_config: Union[DictConfig,
# load the model config
trust_remote_code = om_model_config.get('trust_remote_code', True)
use_auth_token = om_model_config.get('use_auth_token', False)
+ use_flash_attention_2 = om_model_config.get('use_flash_attention_2',
+ False)
+ if use_flash_attention_2 and not is_flash_v2_installed():
+ raise ValueError(
+ 'use_flash_attention_2 is set to True, but flash-attention 2 is not installed. '
+ + 'Please install flash_attn==2.3.2.')
+
config = AutoConfig.from_pretrained(
om_model_config.pretrained_model_name_or_path,
trust_remote_code=trust_remote_code,
use_auth_token=use_auth_token,
)
+ # This is not how you are supposed to set this, but transformers currently only
+ # supports enabling flash attention 2 when using the from_pretrained API.
+ # We need to support it for both from_pretrained and from_config, so we have to
+ # set the private attribute here. This will just skip all of transformers'
+ # validation logic that it is ok to use flash attention 2, so we check
+ # whether it is installed above, and whether the chosen config supports it here.
+ # https://github.com/huggingface/transformers/issues/26878
+ config._flash_attn_2_enabled = use_flash_attention_2
+
# set config overrides
for k, v in om_model_config.get('config_overrides', {}).items():
if not hasattr(config, k):
@@ -142,6 +158,24 @@ def __init__(self, om_model_config: Union[DictConfig,
if dist.get_local_rank() != 0 and init_device == 'mixed':
om_model_config.pretrained = False
+ # If the HuggingFace model is coming from a local folder, Hugging Face copies the modules into the
+ # transformers modules cache. On some systems, this operation can cause contention between
+ # processes. To avoid this contention, we first create the model (on meta device) on local rank
+ # zero. This will set up the transformers model cache and avoid the future contention.
+ if dist.get_local_rank() == 0 and os.path.isdir(
+ om_model_config.pretrained_model_name_or_path):
+ with init_empty_weights(include_buffers=False):
+ with warnings.catch_warnings():
+ warnings.simplefilter('ignore', UserWarning)
+ AutoModelForCausalLM.from_pretrained(
+ om_model_config.pretrained_model_name_or_path,
+ trust_remote_code=trust_remote_code,
+ use_auth_token=use_auth_token,
+ config=config,
+ )
+
+ dist.barrier()
+
# initialize the model on the correct device
if resolved_init_device == 'cpu':
if om_model_config.pretrained:
@@ -200,6 +234,9 @@ def __init__(self, om_model_config: Union[DictConfig,
)
from transformers.models.llama.modeling_llama import \
LlamaAttention
+
+ from llmfoundry.models.layers.llama_attention_monkeypatch import \
+ get_llama_attention_patch_fn
LlamaAttention.forward = get_llama_attention_patch_fn(
attention_patch_type)
model.config.use_cache = False
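For context, a hedged sketch of the model section of a config exercising the new use_flash_attention_2 flag; the checkpoint name is illustrative and the remaining keys mirror typical hf_causal_lm configs.

from omegaconf import OmegaConf

model_cfg = OmegaConf.create({
    'name': 'hf_causal_lm',
    'pretrained_model_name_or_path': 'meta-llama/Llama-2-7b-hf',  # illustrative checkpoint
    'pretrained': True,
    'use_auth_token': True,
    # Requires flash-attn v2 to be installed; otherwise __init__ raises the ValueError above.
    'use_flash_attention_2': True,
})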
diff --git a/llmfoundry/models/layers/attention.py b/llmfoundry/models/layers/attention.py
index c8d578cb2d..cb7e1451fa 100644
--- a/llmfoundry/models/layers/attention.py
+++ b/llmfoundry/models/layers/attention.py
@@ -5,7 +5,7 @@
import math
import warnings
-from typing import Any, List, Optional, Tuple
+from typing import Any, Optional
import torch
import torch.nn as nn
@@ -17,12 +17,13 @@
from llmfoundry.models.layers.norm import NORM_CLASS_REGISTRY
-def is_flash_v2_installed():
+def is_flash_v2_installed(v2_version: str = '2.0.0'):
+ assert version.parse(v2_version) >= version.parse('2.0.0')
try:
import flash_attn as flash_attn
except:
return False
- return version.parse(flash_attn.__version__) >= version.parse('2.0.0')
+ return version.parse(flash_attn.__version__) >= version.parse(v2_version)
def is_flash_v1_installed():
@@ -33,6 +34,16 @@ def is_flash_v1_installed():
return version.parse(flash_attn.__version__) < version.parse('2.0.0')
+# Before importing any transformers models, we need to disable transformers flash attention if
+# we are in an environment with flash attention version <2. Otherwise, transformers raises a hard
+# error because of an improperly gated import.
+if is_flash_v1_installed():
+ import transformers
+ transformers.utils.is_flash_attn_available = lambda: False
+
+from transformers.models.llama.modeling_llama import apply_rotary_pos_emb
+
+
def _reset_is_causal(num_query_tokens: int, num_key_tokens: int,
original_is_causal: bool) -> bool:
# disable causal when it is not needed
@@ -70,7 +81,7 @@ def scaled_multihead_dot_product_attention(
value: torch.Tensor,
n_heads: int,
kv_n_heads: Optional[int] = None,
- past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+ past_key_value: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
softmax_scale: Optional[float] = None,
attn_bias: Optional[torch.Tensor] = None,
key_padding_mask: Optional[torch.Tensor] = None,
@@ -80,8 +91,8 @@ def scaled_multihead_dot_product_attention(
needs_weights: bool = False,
multiquery: bool = False,
tensor_parallel_qkvo: bool = False,
- tp_world_size: Optional[int] = None,
-) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor,
+ tp_world_size: Optional[int] = None
+) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor,
torch.Tensor]]]:
if multiquery:
@@ -195,7 +206,7 @@ def scaled_multihead_dot_product_attention(
def check_valid_inputs(*tensors: torch.Tensor,
- valid_dtypes: Optional[List[torch.dtype]] = None):
+ valid_dtypes: Optional[list[torch.dtype]] = None):
if valid_dtypes is None:
valid_dtypes = [torch.float16, torch.bfloat16]
for tensor in tensors:
@@ -211,7 +222,7 @@ def flash_attn_fn(
value: torch.Tensor,
n_heads: int,
kv_n_heads: Optional[int] = None,
- past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+ past_key_value: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
softmax_scale: Optional[float] = None,
attn_bias: Optional[torch.Tensor] = None,
key_padding_mask: Optional[torch.Tensor] = None,
@@ -220,7 +231,7 @@ def flash_attn_fn(
training: bool = False,
needs_weights: bool = False,
multiquery: bool = False,
-) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor,
+) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor,
torch.Tensor]]]:
try:
from flash_attn import bert_padding, flash_attn_interface # type: ignore # yapf: disable # isort: skip
@@ -349,7 +360,7 @@ def triton_flash_attn_fn(
value: torch.Tensor,
n_heads: int,
kv_n_heads: Optional[int] = None,
- past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+ past_key_value: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
softmax_scale: Optional[float] = None,
attn_bias: Optional[torch.Tensor] = None,
key_padding_mask: Optional[torch.Tensor] = None,
@@ -360,8 +371,7 @@ def triton_flash_attn_fn(
multiquery: bool = False,
tensor_parallel_qkvo: bool = False,
tp_world_size: Optional[int] = None,
-
-) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor,
+) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor,
torch.Tensor]]]:
try:
from llmfoundry.models.layers.flash_attn_triton import flash_attn_func
@@ -584,12 +594,13 @@ def __init__(
def forward(
self,
x: torch.Tensor,
- past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+ past_key_value: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
attn_bias: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
+ rotary_emb_w_meta_info: Optional[dict] = None,
is_causal: bool = True,
needs_weights: bool = False,
- ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[
+ ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[
torch.Tensor, torch.Tensor]]]:
qkv = self.Wqkv(x)
@@ -625,6 +636,39 @@ def forward(
query = self.q_ln(query).to(dtype)
key = self.k_ln(key).to(dtype)
+ if rotary_emb_w_meta_info is not None:
+ rotary_emb = rotary_emb_w_meta_info['rotary_emb']
+ seq_len = rotary_emb_w_meta_info['seq_len']
+ offset_info = rotary_emb_w_meta_info['offset_info']
+ bsz, seqlen = query.shape[:2]
+ query = query.view(bsz, seqlen, -1, self.head_dim)
+ key = key.view(bsz, seqlen, -1, self.head_dim)
+
+ if rotary_emb_w_meta_info['impl'] == 'dail':
+ value = value.view(bsz, seqlen, -1, self.head_dim)
+
+ kv = torch.stack([key, value], dim=2)
+ query, kv = rotary_emb(query,
+ kv,
+ seqlen_offset=offset_info,
+ max_seqlen=seq_len)
+ [key, value] = torch.unbind(kv, dim=2)
+
+ value = value.view(bsz, seqlen, self.kv_n_heads * self.head_dim)
+ elif rotary_emb_w_meta_info['impl'] == 'hf':
+ (cos, sin) = rotary_emb(value, seq_len)
+ # The following two transposes should be removed once the transformers library allows for the specification of the dimension for heads in the call to apply_rotary_pos_emb
+ query = query.transpose(1, 2)
+ key = key.transpose(1, 2)
+ query, key = apply_rotary_pos_emb(query, key, cos, sin,
+ offset_info)
+ # Undo the transposes above; these should also be removed once the transformers library allows specifying the heads dimension in the call to apply_rotary_pos_emb
+ query = query.transpose(1, 2)
+ key = key.transpose(1, 2)
+
+ query = query.view(bsz, seqlen, self.d_model)
+ key = key.view(bsz, seqlen, self.kv_n_heads * self.head_dim)
+
context, attn_weights, past_key_value = self.attn_fn(
query,
key,
@@ -675,8 +719,6 @@ def __init__(
kv_n_heads=n_heads, # for MHA, same # heads as kv groups
attn_impl=attn_impl,
clip_qkv=clip_qkv,
- tensor_parallel_qkvo=tensor_parallel_qkvo,
- tp_world_size=tp_world_size,
qk_ln=qk_ln,
tensor_parallel_qkvo=tensor_parallel_qkvo,
tp_world_size=tp_world_size,
@@ -733,7 +775,7 @@ def __init__(
def attn_bias_shape(
attn_impl: str, n_heads: int, seq_len: int, alibi: bool,
prefix_lm: bool, causal: bool,
- use_sequence_id: bool) -> Optional[Tuple[int, int, int, int]]:
+ use_sequence_id: bool) -> Optional[tuple[int, int, int, int]]:
if attn_impl == 'flash':
return None
elif attn_impl in ['torch', 'triton']:
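A small sketch of the version-gated helper added above; the '2.0.1' floor mirrors the requirement that configuration_mpt.py enforces for the dail rope implementation.

from llmfoundry.models.layers.attention import is_flash_v2_installed

if is_flash_v2_installed(v2_version='2.0.1'):
    print('flash-attn >= 2.0.1 found: dail rope and use_flash_attention_2 are usable')
else:
    print('flash-attn v2 not found (or too old): fall back to torch/triton attention or hf rope')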
diff --git a/llmfoundry/models/layers/blocks.py b/llmfoundry/models/layers/blocks.py
index a08ef6d77f..2bd678ddb1 100644
--- a/llmfoundry/models/layers/blocks.py
+++ b/llmfoundry/models/layers/blocks.py
@@ -12,6 +12,33 @@
from llmfoundry.models.layers.ffn import FFN_CLASS_REGISTRY, build_ffn
from llmfoundry.models.layers.norm import NORM_CLASS_REGISTRY
+attn_config_defaults: Dict = {
+ 'attn_type': 'multihead_attention',
+ 'attn_pdrop': 0.0,
+ 'attn_impl': 'triton',
+ 'qk_ln': False,
+ 'clip_qkv': None,
+ 'softmax_scale': None,
+ 'prefix_lm': False,
+ 'attn_uses_sequence_id': False,
+ 'alibi': False,
+ 'alibi_bias_max': 8,
+ 'rope': False,
+ 'rope_theta': 10000,
+ 'rope_impl': 'dail',
+ 'rope_dail_config': {
+ 'type': 'original',
+ 'pos_idx_in_fp32': True,
+ 'xpos_scale_base': 512,
+ },
+ 'rope_hf_config': {
+ 'type': 'no_scaling',
+ 'factor': 1.0,
+ },
+ 'tensor_parallel_qkvo': False,
+ 'tp_world_size': None,
+}
+
class MPTBlock(nn.Module):
@@ -30,18 +57,7 @@ def __init__(
**kwargs: Any,
):
if attn_config is None:
- attn_config = {
- 'attn_type': 'multihead_attention',
- 'attn_pdrop': 0.0,
- 'attn_impl': 'triton',
- 'qk_ln': False,
- 'clip_qkv': None,
- 'softmax_scale': None,
- 'prefix_lm': False,
- 'attn_uses_sequence_id': False,
- 'alibi': False,
- 'alibi_bias_max': 8,
- }
+ attn_config = attn_config_defaults
if ffn_config is None:
ffn_config = {
@@ -58,7 +74,8 @@ def __init__(
# necessary to avoid passing extraneous args into attn_class while allowing the use of **kwargs
args_to_exclude_in_attn_class = {
'attn_type', 'prefix_lm', 'alibi', 'attn_uses_sequence_id',
- 'alibi_bias_max'
+ 'alibi_bias_max', 'rope', 'rope_theta', 'rope_impl',
+ 'rope_dail_config', 'rope_hf_config'
}
attn_config_subset_for_attn_class = {
k: v
@@ -94,6 +111,7 @@ def forward(
x: torch.Tensor,
past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
attn_bias: Optional[torch.Tensor] = None,
+ rotary_emb_w_meta_info: Optional[Dict] = None,
attention_mask: Optional[torch.ByteTensor] = None,
is_causal: bool = True,
output_attentions: bool = False,
@@ -104,6 +122,7 @@ def forward(
a,
past_key_value=past_key_value,
attn_bias=attn_bias,
+ rotary_emb_w_meta_info=rotary_emb_w_meta_info,
attention_mask=attention_mask,
is_causal=is_causal,
needs_weights=output_attentions,
diff --git a/llmfoundry/models/layers/llama_attention_monkeypatch.py b/llmfoundry/models/layers/llama_attention_monkeypatch.py
index 88f61e3fef..9ceeb0747e 100644
--- a/llmfoundry/models/layers/llama_attention_monkeypatch.py
+++ b/llmfoundry/models/layers/llama_attention_monkeypatch.py
@@ -78,6 +78,8 @@ def llama_attention_patch_torch(
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: bool = False,
use_cache: bool = False,
+ # Temporary fix for llama2 transformers compatibility, padding_mask will be deprecated in the next transformers release after 4.34.1.
+ padding_mask: Optional[torch.LongTensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
if use_cache:
raise NotImplementedError(
@@ -186,6 +188,8 @@ def llama_attention_patch_triton(
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: bool = False,
use_cache: bool = False,
+ # Temporary fix for llama2 transformers compatibility, padding_mask will be deprecated in the next transformers release after 4.34.1.
+ padding_mask: Optional[torch.LongTensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
if use_cache:
raise NotImplementedError(
diff --git a/llmfoundry/models/mpt/configuration_mpt.py b/llmfoundry/models/mpt/configuration_mpt.py
index 9d13617ab7..0df6f7c29a 100644
--- a/llmfoundry/models/mpt/configuration_mpt.py
+++ b/llmfoundry/models/mpt/configuration_mpt.py
@@ -7,21 +7,16 @@
from typing import Any, Dict, Optional, Union
from transformers import PretrainedConfig
+from llmfoundry.models.layers.attention import is_flash_v2_installed
+from llmfoundry.models.layers.blocks import attn_config_defaults
-attn_config_defaults: Dict = {
- 'attn_type': 'multihead_attention',
- 'attn_pdrop': 0.0,
- 'attn_impl': 'triton',
- 'qk_ln': False,
- 'clip_qkv': None,
- 'tensor_parallel_qkvo': False,
- 'tp_world_size': None,
- 'softmax_scale': None,
- 'prefix_lm': False,
- 'attn_uses_sequence_id': False,
- 'alibi': False,
- 'alibi_bias_max': 8,
-}
+# NOTE: All utils are imported directly even if unused so that
+# HuggingFace can detect all the needed files to copy into its modules folder.
+# Otherwise, certain modules are missing.
+# isort: off
+from llmfoundry.models.layers.fc import FC_CLASS_REGISTRY # type: ignore (see note)
+from llmfoundry.models.layers.norm import LPLayerNorm # type: ignore (see note)
+from llmfoundry.models.layers.ffn import FFN_CLASS_REGISTRY # type: ignore (see note)
ffn_config_defaults: Dict = {
'ffn_type': 'mptmlp',
@@ -63,6 +58,7 @@ def __init__(
use_cache: bool = False,
init_config: Dict = init_config_defaults,
fc_type: str = 'torch',
+ tie_word_embeddings: bool = True,
verbose: Optional[int] = None,
**kwargs: Any,
):
@@ -98,6 +94,16 @@ def __init__(
Defaults to ``False`` meaning any provided `sequence_id` will be ignored.
alibi (bool): Whether to use the alibi bias instead of position embeddings.
alibi_bias_max (int): The maximum value of the alibi bias.
+ rope (bool): Whether to use rotary positional embeddings.
+ rope_theta (int): The base frequency for rope.
+ rope_impl (str): The implementation of rope to use. One of 'hf' (to use the implementation from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py) or 'dail' (to use the implementation from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/layers/rotary.py).
+ rope_dail_config (Dict): The configuration for the dail implementation of rope.
+ type (str): The type of rotary position embedding to use. Options: 'original' (for https://arxiv.org/pdf/2104.09864.pdf), 'xpos' (for https://arxiv.org/pdf/2212.10554.pdf).
+ pos_idx_in_fp32 (bool): If True, the position indices [0, ..., seqlen - 1] are in fp32, otherwise they might be in lower precision. A consequence could be, for example, that bf16 rounds position 1995 to 2000, which leads to them having the same positional embedding.
+ xpos_scale_base (float): The scale base for XPos (if using XPos).
+ rope_hf_config (Dict): A dictionary used to configure rope's scaling behavior (when scaling beyond the training length).
+ type (str): Can be one of 'no_scaling', 'linear', or 'dynamic'. 'no_scaling' uses the default implementation for rotary embeddings, 'linear' uses linear scaling as proposed by the Reddit user /u/kaiokendev, and 'dynamic' uses Dynamic NTK scaling as proposed by the Reddit users /u/bloc97 and /u/emozilla.
+ factor (float): Scaling factor to use if using 'linear' or 'dynamic' as rope_scaling.type.
kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads.
ffn_config (Dict): A dictionary used to configure the model's ffn module:
ffn_type (str): type of ffn to use. Options: mptmlp, te_ln_mlp
@@ -124,6 +130,7 @@ def __init__(
---
See llmfoundry.models.utils.param_init_fns.py for info on other param init config options
fc_type (str): choose fc layer implementation. Options: torch and te. te layers support fp8 when using H100 GPUs.
+ tie_word_embeddings (bool): Whether to tie the input embedding and output layers.
"""
self.d_model = d_model
self.n_heads = n_heads
@@ -154,11 +161,17 @@ def __init__(
del kwargs['name']
if 'loss_fn' in kwargs:
del kwargs['loss_fn']
- if self.attn_config.get('alibi', False):
+ if self.attn_config.get('alibi', False) or self.attn_config.get(
+ 'rope', False):
self.learned_pos_emb = False
warnings.warn(
- f'alibi is turned on, setting `learned_pos_emb` to `False.`')
- super().__init__(**kwargs)
+ f'alibi or rope is turned on, setting `learned_pos_emb` to `False`.'
+ )
+ # tie_word_embeddings is set in Huggingface's PretrainedConfig __init__
+ super().__init__(
+ tie_word_embeddings=tie_word_embeddings,
+ **kwargs,
+ )
self._validate_config()
@@ -168,6 +181,10 @@ def _set_config_defaults(self, config: Dict[str, Any],
for k, v in config_defaults.items():
if k not in config:
config[k] = v
+ elif isinstance(v, dict):
+ # recursively set default values for any sub-dicts
+ config[k] = self._set_config_defaults(
+ config[k] if (config[k] is not None) else {}, v)
return config
def _validate_config(self) -> None:
@@ -210,6 +227,31 @@ def _validate_config(self) -> None:
raise NotImplementedError(
'attn_uses_sequence_id only implemented with torch and triton attention.'
)
+ if self.attn_config['rope'] and (self.attn_config['rope_impl']
+ not in ['dail', 'hf']):
+ raise ValueError(
+ 'If rope is being used, then rope_impl should be either "dail" or "hf".'
+ )
+ if self.attn_config['rope'] and (
+ self.attn_config['rope_impl']
+ == 'hf') and self.attn_config['rope_hf_config']['type'] not in [
+ 'no_scaling', 'linear', 'dynamic'
+ ]:
+ raise ValueError(
+ 'If using the hf implementation of rope, the type should be one of "no_scaling", "linear", or "dynamic".'
+ )
+ if self.attn_config['rope'] and (self.attn_config['rope_impl']
+ == 'dail'):
+ if self.attn_config['rope_dail_config']['type'] not in [
+ 'original', 'xpos'
+ ]:
+ raise ValueError(
+ 'If using the dail implementation of rope, the type should be one of "original" or "xpos".'
+ )
+ if not is_flash_v2_installed(v2_version='2.0.1'):
+ raise ImportError(
+ 'If using the dail implementation of rope, the flash_attn library v2.0.1 or higher must be installed. Please check the instructions at https://github.com/mosaicml/llm-foundry/blob/main/TUTORIAL.md#what-kinds-of-positional-embeddings-does-llm-foundry-support'
+ )
if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
raise ValueError(
'model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!'
@@ -221,9 +263,10 @@ def _validate_config(self) -> None:
)
if self.init_config.get('name', None) is None:
raise ValueError(f"{self.init_config=} 'name' needs to be set.")
- if not self.learned_pos_emb and not self.attn_config['alibi']:
+ if not (self.learned_pos_emb or self.attn_config['alibi'] or
+ self.attn_config['rope']):
warnings.warn(
- f'Positional information not being provided to the model using either learned_pos_emb or alibi.'
+ f'Positional information not being provided to the model using learned_pos_emb, alibi, or rope.'
)
if self.fc_type == 'te' or self.ffn_config['ffn_type'] == 'te_ln_mlp':
try:
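A hedged sketch tying together the new MPTConfig options above: rope via the hf implementation plus untied embeddings. The hyperparameter values are illustrative; unspecified rope sub-keys are filled in recursively from attn_config_defaults.

from llmfoundry.models.mpt import MPTConfig, MPTForCausalLM

config = MPTConfig(
    d_model=768,
    n_heads=12,
    n_layers=12,
    max_seq_len=2048,
    tie_word_embeddings=False,  # MPTForCausalLM now builds a separate lm_head
    attn_config={
        'attn_impl': 'torch',
        'rope': True,
        'rope_impl': 'hf',  # 'dail' would additionally require flash-attn >= 2.0.1
        'rope_hf_config': {'type': 'linear', 'factor': 2.0},
        # rope_theta, rope_dail_config, etc. fall back to attn_config_defaults
    },
)
model = MPTForCausalLM(config)  # learned_pos_emb is turned off automatically because rope is on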
diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py
index e27a26f13a..39f4782274 100644
--- a/llmfoundry/models/mpt/modeling_mpt.py
+++ b/llmfoundry/models/mpt/modeling_mpt.py
@@ -8,7 +8,7 @@
import math
import warnings
-from functools import cached_property, partial
+from functools import partial
from typing import (Any, Dict, List, Mapping, MutableMapping, Optional, Tuple,
Union)
@@ -24,11 +24,21 @@
from composer.metrics.nlp import LanguageCrossEntropy, LanguagePerplexity
from composer.models import HuggingFaceModel
from composer.utils import dist, get_device
+
+from llmfoundry.models.layers.attention import is_flash_v2_installed
+
+if is_flash_v2_installed():
+ try: # This try...except is needed because transformers requires it despite the 'if' statement above
+ from flash_attn.layers.rotary import \
+ RotaryEmbedding as DAILRotaryEmbedding
+ except Exception as e:
+ raise e
+
from omegaconf import DictConfig
from omegaconf import OmegaConf as om
from torch.distributed._tensor import (DeviceMesh, Shard, distribute_module,
distribute_tensor)
-from torch.distributed.tensor.parallel import (ColwiseParallel, RowwiseParallel,
+from torch.distributed.tensor.parallel import (RowwiseParallel,
make_input_replicate_1d,
make_sharded_output_tensor,
parallelize_module)
@@ -37,8 +47,16 @@
from transformers import PreTrainedModel, PreTrainedTokenizerBase
from transformers.modeling_outputs import (BaseModelOutputWithPast,
CausalLMOutputWithPast)
-
-from llmfoundry.models.layers.attention import attn_bias_shape, build_attn_bias
+from transformers.models.llama.modeling_llama import \
+ LlamaDynamicNTKScalingRotaryEmbedding as HFDynamicNTKScalingRotaryEmbedding
+from transformers.models.llama.modeling_llama import \
+ LlamaLinearScalingRotaryEmbedding as HFLinearScalingRotaryEmbedding
+from transformers.models.llama.modeling_llama import \
+ LlamaRotaryEmbedding as HFRotaryEmbedding
+
+from llmfoundry.models.layers.attention import (ATTN_CLASS_REGISTRY,
+ attn_bias_shape,
+ build_attn_bias)
from llmfoundry.models.layers.blocks import MPTBlock
from llmfoundry.models.layers.custom_embedding import SharedEmbedding
from llmfoundry.models.layers.fc import FC_CLASS_REGISTRY as FC_CLASS_REGISTRY
@@ -79,6 +97,50 @@
log = logging.getLogger(__name__)
+def gen_rotary_embedding(rope_head_dim: int, rope_impl: str, rope_theta: int,
+ rope_dail_config: dict, rope_hf_config: dict,
+ max_seq_len: int):
+ if rope_impl == 'dail':
+ return DAILRotaryEmbedding(
+ dim=rope_head_dim,
+ base=rope_theta,
+ interleaved=False,
+ scale_base=rope_dail_config['xpos_scale_base'] if
+ (rope_dail_config['type'] == 'xpos') else None,
+ pos_idx_in_fp32=rope_dail_config['pos_idx_in_fp32'],
+ device=
+ 'cpu', # FSDP does not materialize modules with meta buffers, hence device is set to cpu
+ )
+ elif rope_impl == 'hf':
+ if rope_hf_config['type'] == 'no_scaling':
+ return HFRotaryEmbedding(
+ rope_head_dim,
+ max_position_embeddings=max_seq_len,
+ base=rope_theta,
+ device=
+ 'cpu' # FSDP does not materialize modules with meta buffers, hence device is set to cpu
+ )
+ elif rope_hf_config['type'] == 'linear':
+ return HFLinearScalingRotaryEmbedding(
+ rope_head_dim,
+ max_position_embeddings=max_seq_len,
+ base=rope_theta,
+ scaling_factor=rope_hf_config['factor'],
+ device=
+ 'cpu' # FSDP does not materialize modules with meta buffers, hence device is set to cpu
+ )
+ elif rope_hf_config['type'] == 'dynamic':
+ return HFDynamicNTKScalingRotaryEmbedding(
+ rope_head_dim,
+ max_position_embeddings=max_seq_len,
+ base=rope_theta,
+ scaling_factor=rope_hf_config['factor'],
+ device=
+ 'cpu' # FSDP does not materialize modules with meta buffers, hence device is set to cpu
+ )
+ raise ValueError('rope_impl needs to be either dail or hf')
+
+
class MPTPreTrainedModel(PreTrainedModel):
config_class = MPTConfig
base_model_prefix = 'model'
@@ -175,6 +237,18 @@ def __init__(self, config: MPTConfig):
])
self.norm_f = norm_class(config.d_model, device=config.init_device)
+ self.rope = config.attn_config['rope']
+ self.rope_impl = None
+ if self.rope:
+ self.rope_impl = config.attn_config['rope_impl']
+ self.rotary_embedding = gen_rotary_embedding(
+ rope_head_dim=config.d_model // config.n_heads,
+ rope_impl=self.rope_impl,
+ rope_theta=config.attn_config['rope_theta'],
+ rope_dail_config=config.attn_config['rope_dail_config'],
+ rope_hf_config=config.attn_config['rope_hf_config'],
+ max_seq_len=self.config.max_seq_len)
+
if config.init_device != 'meta':
log.info(
f'We recommend using config.init_device="meta" with Composer + FSDP for faster initialization.'
@@ -192,7 +266,6 @@ def __init__(self, config: MPTConfig):
mesh_dim_names=['ep', 'tp'],
)
new_blocks = nn.ModuleList()
- torch.set_printoptions(profile='full', sci_mode=False)
for block in self.blocks:
qkv_module = block.get_submodule('attn.Wqkv')
oned_mesh = _create_1d_device_mesh(twod_mesh, tp_mesh_dim=1)
@@ -276,10 +349,11 @@ def __init__(self, config: MPTConfig):
log.debug(self)
log.debug(f'Using {self.config.init_config["name"]} initialization.')
- def get_input_embeddings(self) -> nn.Embedding:
+ def get_input_embeddings(self) -> Union[SharedEmbedding, nn.Embedding]:
return self.wte
- def set_input_embeddings(self, value: nn.Embedding) -> None:
+ def set_input_embeddings(
+ self, value: Union[SharedEmbedding, nn.Embedding]) -> None:
self.wte = value
@torch.no_grad()
@@ -478,8 +552,9 @@ def forward(
S <= self.config.max_seq_len
), f'Cannot forward input with seq_len={S}, this model only supports seq_len<={self.config.max_seq_len}'
- tok_emb = self.wte(input_ids)
- if self.learned_pos_emb:
+ rotary_emb_w_meta_info = None
+ x = self.wte(input_ids)
+ if self.learned_pos_emb or self.rope:
past_position = 0
if past_key_values is not None:
if len(past_key_values) != self.config.n_layers:
@@ -495,31 +570,44 @@ def forward(
if self.attn_impl == 'torch':
past_position = past_key_values[0][0].size(3)
- if S + past_position > self.config.max_seq_len:
+ if self.learned_pos_emb and (S + past_position >
+ self.config.max_seq_len):
raise ValueError(
f'Cannot forward input with past sequence length {past_position} and current sequence length '
+
f'{S + 1}, this model only supports total sequence length <= {self.config.max_seq_len}.'
)
- pos = torch.arange(
- past_position,
- S + past_position,
- dtype=torch.long,
- device=input_ids.device,
- ).unsqueeze(0)
- if attention_mask is not None:
- # adjust the position indices to account for padding tokens
- pos = torch.clamp(
- pos - torch.cumsum((~attention_mask).to(torch.int32),
- dim=1)[:, past_position:],
- min=0,
- )
- pos_emb = self.wpe(pos)
- x = tok_emb + pos_emb
- else:
- # ALiBi and NoPE use this path (RoPE will also use this path if / when enabled)
- x = tok_emb
+ if self.learned_pos_emb or (self.rope and self.rope_impl == 'hf'):
+ pos = torch.arange(
+ past_position,
+ S + past_position,
+ dtype=torch.long,
+ device=input_ids.device,
+ ).unsqueeze(0)
+ if attention_mask is not None:
+ # adjust the position indices to account for padding tokens
+ pos = torch.clamp(
+ pos - torch.cumsum((~attention_mask).to(torch.int32),
+ dim=1)[:, past_position:],
+ min=0,
+ )
+ if self.learned_pos_emb:
+ x = x + self.wpe(pos)
+ elif self.rope and self.rope_impl == 'hf':
+ rotary_emb_w_meta_info = {
+ 'impl': self.rope_impl,
+ 'rotary_emb': self.rotary_embedding,
+ 'offset_info': pos,
+ 'seq_len': S + past_position,
+ }
+ elif self.rope and self.rope_impl == 'dail':
+ rotary_emb_w_meta_info = {
+ 'impl': self.rope_impl,
+ 'rotary_emb': self.rotary_embedding,
+ 'offset_info': past_position,
+ 'seq_len': S + past_position,
+ }
if self.embedding_fraction == 1:
x = self.emb_drop(x)
@@ -556,6 +644,7 @@ def forward(
x,
past_key_value=past_key_value,
attn_bias=attn_bias,
+ rotary_emb_w_meta_info=rotary_emb_w_meta_info,
attention_mask=attention_mask,
is_causal=self.is_causal,
output_attentions=bool(output_attentions),
@@ -604,14 +693,20 @@ class MPTForCausalLM(MPTPreTrainedModel):
def __init__(self, config: MPTConfig):
super().__init__(config)
- if not config.tie_word_embeddings:
- raise ValueError(
- 'MPTForCausalLM only supports tied word embeddings')
-
log.info(f'Instantiating an MPTForCausalLM model from {__file__}')
self.transformer: MPTModel = MPTModel(config)
+ self.lm_head = None
+ if not config.tie_word_embeddings:
+ self.lm_head = nn.Linear(
+ config.d_model,
+ config.vocab_size,
+ bias=False,
+ device=config.init_device,
+ )
+ self.lm_head._fsdp_wrap = True
+
for child in self.transformer.children():
if isinstance(child, torch.nn.ModuleList):
continue
@@ -632,19 +727,38 @@ def __init__(self, config: MPTConfig):
)
self.logit_scale = logit_scale
- def get_input_embeddings(self) -> nn.Embedding:
- return self.transformer.wte
+ def get_input_embeddings(self) -> Union[SharedEmbedding, nn.Embedding]:
+ return self.transformer.get_input_embeddings()
def set_input_embeddings(
self, value: Union[SharedEmbedding, nn.Embedding]) -> None:
- self.transformer.wte = value
+ self.transformer.set_input_embeddings(value)
- def get_output_embeddings(self) -> nn.Embedding:
- return self.transformer.wte
+ def get_output_embeddings(
+ self) -> Union[SharedEmbedding, nn.Embedding, nn.Linear]:
+ if self.lm_head is not None:
+ return self.lm_head
+ return self.transformer.get_input_embeddings()
def set_output_embeddings(
- self, new_embeddings: Union[SharedEmbedding, nn.Embedding]) -> None:
- self.transformer.wte = new_embeddings
+ self, new_embeddings: Union[SharedEmbedding, nn.Embedding,
+ nn.Linear]) -> None:
+ if self.lm_head is not None:
+ self.lm_head = new_embeddings
+ else:
+ if not isinstance(new_embeddings, (SharedEmbedding, nn.Embedding)):
+ raise ValueError(
+ 'new_embeddings must be an instance of SharedEmbedding ' +
+ f'or nn.Embedding, but got {type(new_embeddings)}.')
+ warnings.warn(
+ 'Using `set_output_embeddings` to set the embedding layer of ' +
+ 'MPTForCausalLM with tied weights. Given weights are tied, ' +
+ 'using `set_input_embeddings` is recommended over using ' +
+ '`set_output_embeddings`.')
+ self.transformer.set_input_embeddings(new_embeddings)
+
+ def tie_weights(self) -> None:
+ self.lm_head = None
def set_decoder(self, decoder: MPTModel) -> None:
self.transformer = decoder
@@ -688,12 +802,14 @@ def forward(
use_cache=use_cache,
)
- # move outputs to same device as weights for token embedding
- # needed to support HF `device_map`
- logits = self.transformer.wte(
- outputs.last_hidden_state.to(self.transformer.wte.weight.device),
- True,
- )
+ if self.lm_head is not None:
+ logits = self.lm_head(outputs.last_hidden_state)
+ else:
+ # move outputs to same device as weights for token embedding
+ # needed to support HF `device_map`
+ out = outputs.last_hidden_state
+ out = out.to(self.transformer.wte.weight.device)
+ logits = self.transformer.wte(out, True)
if self.logit_scale is not None:
if self.logit_scale == 0:
@@ -735,7 +851,35 @@ def fsdp_wrap_fn(self, module: nn.Module) -> bool:
# Activation Checkpointing
def activation_checkpointing_fn(self, module: nn.Module) -> bool:
- return isinstance(module, MPTBlock)
+ act_ckpt_list = getattr(self.config, 'activation_checkpointing_target',
+ None) or ['MPTBlock']
+
+ if 'MPTBlock' in act_ckpt_list or 'mptblock' in act_ckpt_list:
+ if len(act_ckpt_list) > 1:
+ log.info(
+ 'Activation checkpointing MPTBlock only (ignoring other sub-block modules specified in activation_checkpointing_target).'
+ )
+ return isinstance(module, MPTBlock)
+
+ mod_types = ()
+ for mod_name in act_ckpt_list:
+ if mod_name.lower() == 'mptblock':
+ mod_types += (MPTBlock,)
+ elif mod_name in ATTN_CLASS_REGISTRY:
+ mod_types += (ATTN_CLASS_REGISTRY[mod_name],)
+ elif mod_name in FFN_CLASS_REGISTRY:
+ mod_types += (FFN_CLASS_REGISTRY[mod_name],)
+ elif mod_name in NORM_CLASS_REGISTRY:
+ mod_types += (NORM_CLASS_REGISTRY[mod_name],)
+ else:
+ msg = ', '.join(
+ list(ATTN_CLASS_REGISTRY.keys()) +
+ list(FFN_CLASS_REGISTRY.keys()) +
+ list(NORM_CLASS_REGISTRY.keys()) + ['MPTBlock'])
+ raise ValueError(
+ f'{mod_name} (specified in activation_checkpointing_target) is not a recognized option out of available options {msg}.'
+ )
+ return isinstance(module, mod_types)
def prepare_inputs_for_generation(
self,
@@ -889,7 +1033,11 @@ def flops_per_batch(self, batch: Mapping) -> int:
# assume the backward pass is approximately 2x the forward pass
bs, msl = batch['input_ids'].shape[0:2]
- params_flops_per_token = 2 * self.n_active_params
+ params = self.n_active_params
+ if not self.model.transformer.config.tie_word_embeddings:
+ # embedding layers are lookup tables and are therefore not counted in the FLOP computation
+ params -= self.model.transformer.wte.weight.numel()
+ params_flops_per_token = 2 * params
params_flops_per_seq = params_flops_per_token * msl
attn_flops_per_seq = (self.model.config.n_layers * 2 * 2 *
(self.model.config.d_model * (msl**2)))
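Lastly, a hedged sketch of where the new activation_checkpointing_target option would live in the model section of a training config; the module names come from the attention and norm registries referenced above, and the remaining keys are illustrative.

model_cfg = {
    'name': 'mpt_causal_lm',
    'd_model': 2048,
    'n_heads': 16,
    'n_layers': 24,
    'max_seq_len': 2048,
    # Checkpoint only these sub-modules instead of whole MPTBlocks; an unrecognized name raises
    # a ValueError listing the available options from the ATTN/FFN/NORM registries plus 'MPTBlock'.
    'activation_checkpointing_target': ['grouped_query_attention', 'low_precision_layernorm'],
}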
diff --git a/llmfoundry/models/utils/hf_prefixlm_converter.py b/llmfoundry/models/utils/hf_prefixlm_converter.py
index fb9477d909..692fab94c2 100644
--- a/llmfoundry/models/utils/hf_prefixlm_converter.py
+++ b/llmfoundry/models/utils/hf_prefixlm_converter.py
@@ -10,31 +10,14 @@
and treat the input prompt as the prefix in `generate`.
"""
-import math
-import warnings
from types import MethodType
from typing import Any, List, MutableMapping, Optional, Tuple, Union
import torch
-from transformers.models.bloom.modeling_bloom import (
- BaseModelOutputWithPastAndCrossAttentions, BloomForCausalLM, BloomModel,
- CausalLMOutputWithCrossAttentions, CrossEntropyLoss)
-from transformers.models.bloom.modeling_bloom import \
- _expand_mask as _expand_mask_bloom
-from transformers.models.bloom.modeling_bloom import \
- _make_causal_mask as _make_causal_mask_bloom
-from transformers.models.bloom.modeling_bloom import logging
from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel
from transformers.models.gpt_neo.modeling_gpt_neo import GPTNeoForCausalLM
from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXForCausalLM
from transformers.models.gptj.modeling_gptj import GPTJForCausalLM
-from transformers.models.opt.modeling_opt import OPTForCausalLM
-from transformers.models.opt.modeling_opt import \
- _expand_mask as _expand_mask_opt
-from transformers.models.opt.modeling_opt import \
- _make_causal_mask as _make_causal_mask_opt
-
-logger = logging.get_logger(__name__)
_SUPPORTED_GPT_MODELS = (
GPT2LMHeadModel,
@@ -223,583 +206,10 @@ def generate(self: CAUSAL_GPT_TYPES, *args: Any, **kwargs: Any):
return model
-def _convert_bloom_causal_lm_to_prefix_lm(
- model: BloomForCausalLM) -> BloomForCausalLM:
- """Converts a BLOOM Causal LM to a Prefix LM.
-
- Supported HuggingFace model classes:
- - `BloomForCausalLM`
-
- See `convert_hf_causal_lm_to_prefix_lm` for more details.
- """
- if hasattr(model, '_prefix_lm_converted'):
- return model
-
- assert isinstance(model, BloomForCausalLM)
- assert model.config.add_cross_attention == False, 'Only supports BLOOM decoder-only models'
-
- # Modified from transformers.models.bloom.modeling_bloom.BloomModel._prepare_attn_mask
- # https://github.com/huggingface/transformers/blob/v4.25.1/src/transformers/models/bloom/modeling_bloom.py#L648
- def _prepare_attn_mask(
- self: BloomModel,
- attention_mask: torch.Tensor,
- bidirectional_mask: Optional[torch.Tensor],
- input_shape: Tuple[int, int],
- past_key_values_length: int,
- ) -> torch.BoolTensor:
- # create causal mask
- # [batch_size, seq_length] -> [batch_size, 1, tgt_length, src_length]
- combined_attention_mask = None
- device = attention_mask.device
- _, src_length = input_shape
-
- if src_length > 1:
- combined_attention_mask = _make_causal_mask_bloom(
- input_shape,
- device=device,
- past_key_values_length=past_key_values_length)
- # Make use of the batch-specific `bidirectional_mask` attribute set
- # by the parent module in its (new) `forward` method wrapper
- if bidirectional_mask is not None:
- # The two masks should have the same size
- assert attention_mask.shape == bidirectional_mask.shape
-
- # [batch_size, seq_length] -> [batch_size, 1, tgt_length, src_length]
- expanded_bidirectional_mask = _expand_mask_bloom(
- bidirectional_mask, tgt_length=src_length)
- combined_attention_mask = torch.logical_and(
- combined_attention_mask, expanded_bidirectional_mask)
-
- # [batch_size, seq_length] -> [batch_size, 1, tgt_length, src_length]
- expanded_attn_mask = _expand_mask_bloom(attention_mask,
- tgt_length=src_length)
- combined_attention_mask = (expanded_attn_mask
- if combined_attention_mask is None else
- expanded_attn_mask | combined_attention_mask)
-
- return combined_attention_mask
-
- # Modified from transformers.models.bloom.modeling_bloom._prepare_alibi_transformer
- # https://github.com/huggingface/transformers/blob/v4.25.1/src/transformers/models/bloom/modeling_bloom.py#L87
- def _build_alibi_tensor(
- self: BloomModel,
- batch_size: int,
- query_length: int,
- key_length: int,
- dtype: torch.dtype,
- device: torch.device,
- ) -> torch.Tensor:
- num_heads = self.config.n_head
-
- closest_power_of_2 = 2**math.floor(math.log2(num_heads))
- base = torch.tensor(2**(-(2**-(math.log2(closest_power_of_2) - 3))),
- device=device,
- dtype=torch.float32)
- powers = torch.arange(1,
- 1 + closest_power_of_2,
- device=device,
- dtype=torch.int32)
- slopes = torch.pow(base, powers)
-
- if closest_power_of_2 != num_heads:
- extra_base = torch.tensor(
- 2**(-(2**-(math.log2(2 * closest_power_of_2) - 3))),
- device=device,
- dtype=torch.float32)
- num_remaining_heads = min(closest_power_of_2,
- num_heads - closest_power_of_2)
- extra_powers = torch.arange(1,
- 1 + 2 * num_remaining_heads,
- 2,
- device=device,
- dtype=torch.int32)
- slopes = torch.cat(
- [slopes, torch.pow(extra_base, extra_powers)], dim=0)
-
- qa = torch.arange(query_length, device=device,
- dtype=torch.int32).view(-1, 1)
- ka = torch.arange(key_length, device=device,
- dtype=torch.int32).view(1, -1)
- diffs = qa - ka + key_length - query_length
- diffs = -diffs.abs()
- alibi = slopes.view(1, num_heads, 1, 1) * diffs.view(
- 1, 1, query_length, key_length)
- alibi = alibi.expand(batch_size, -1, -1,
- -1).reshape(-1, query_length, key_length)
- return alibi.to(dtype)
-
- # Modified from transformers.models.bloom.modeling_bloom.BloomModel.forward
- # Note: The modified code is surrounded with #### START/END #### comments
- # and one new argument (`bidirectional_mask`) is added to the signature.
- KeyValueT = Tuple[torch.Tensor, torch.Tensor]
-
- def transformer_forward(
- self: BloomModel,
- input_ids: Optional[torch.LongTensor] = None,
- past_key_values: Optional[Tuple[KeyValueT, ...]] = None,
- attention_mask: Optional[torch.Tensor] = None,
- bidirectional_mask: Optional[torch.Tensor] = None,
- head_mask: Optional[torch.LongTensor] = None,
- inputs_embeds: Optional[torch.LongTensor] = None,
- use_cache: Optional[bool] = None,
- output_attentions: Optional[bool] = None,
- output_hidden_states: Optional[bool] = None,
- return_dict: Optional[bool] = None,
- **deprecated_arguments: Any
- ) -> Union[Tuple[torch.Tensor, ...],
- BaseModelOutputWithPastAndCrossAttentions]:
- if deprecated_arguments.pop('position_ids', False) is not False:
- # `position_ids` could have been `torch.Tensor` or `None` so
- # defaulting pop to `False` allows to detect if users were
- # passing explicitly `None`
- warnings.warn(
- '`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. ' +\
- 'You can safely ignore passing `position_ids`.',
- FutureWarning,
- )
- if len(deprecated_arguments) > 0:
- raise ValueError(
- f'Got unexpected arguments: {deprecated_arguments}')
-
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
- output_hidden_states = (output_hidden_states
- if output_hidden_states is not None else
- self.config.output_hidden_states)
- use_cache = use_cache if use_cache is not None else self.config.use_cache
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
- if input_ids is not None and inputs_embeds is not None:
- raise ValueError(
- 'You cannot specify both input_ids and inputs_embeds at the same time'
- )
- elif input_ids is not None:
- batch_size, seq_length = input_ids.shape
- elif inputs_embeds is not None:
- batch_size, seq_length, _ = inputs_embeds.shape
- else:
- raise ValueError(
- 'You have to specify either input_ids or inputs_embeds')
-
- if past_key_values is None:
- past_key_values = tuple([None] * len(self.h)) # type: ignore
-
- # Prepare head mask if needed
- # 1.0 in head_mask indicate we keep the head
- # attention_probs has shape batch_size x num_heads x N x N
- # head_mask has shape n_layer x batch x num_heads x N x N
- head_mask = self.get_head_mask(head_mask, self.config.n_layer)
-
- if inputs_embeds is None:
- inputs_embeds = self.word_embeddings(input_ids)
-
- hidden_states = self.word_embeddings_layernorm(inputs_embeds)
-
- presents = () if use_cache else None
- all_self_attentions = () if output_attentions else None
- all_hidden_states = () if output_hidden_states else None
-
- # Compute alibi tensor: check build_alibi_tensor documentation
- seq_length_with_past = seq_length
- past_key_values_length = 0
- if past_key_values[0] is not None: # type: ignore
- tmp = past_key_values[0][0] # type: ignore
- past_key_values_length = tmp.shape[2] # type: ignore
- seq_length_with_past = seq_length_with_past + past_key_values_length
- if attention_mask is None:
- attention_mask = torch.ones((batch_size, seq_length_with_past),
- device=hidden_states.device)
- else:
- attention_mask = attention_mask.to(hidden_states.device)
-
- ##### ALL NON-SIGNATURE MODIFICATIONS ARE CONTAINED TO THIS BLOCK [STARTS HERE] #####
- alibi = self._build_alibi_tensor(
- batch_size=batch_size,
- query_length=seq_length,
- key_length=seq_length_with_past,
- dtype=hidden_states.dtype,
- device=hidden_states.device,
- )
-
- causal_mask = self._prepare_attn_mask(
- attention_mask,
- bidirectional_mask,
- input_shape=(batch_size, seq_length),
- past_key_values_length=past_key_values_length,
- )
- ##### ALL NON-SIGNATURE MODIFICATIONS ARE CONTAINED TO THIS BLOCK [ENDS HERE] #####
-
- for i, (block,
- layer_past) in enumerate(zip(self.h,
- past_key_values)): # type: ignore
-
- if output_hidden_states:
- hst = (hidden_states,)
- all_hidden_states = all_hidden_states + hst # type: ignore
-
- if self.gradient_checkpointing and self.training:
-
- if use_cache:
- logger.warning(
- '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...'
- )
- use_cache = False
-
- def create_custom_forward(module: torch.nn.Module):
-
- def custom_forward(*inputs: Any):
- # None for past_key_value
- return module(*inputs,
- use_cache=use_cache,
- output_attentions=output_attentions)
-
- return custom_forward
-
- outputs = torch.utils.checkpoint.checkpoint( # type: ignore
- create_custom_forward(block),
- hidden_states,
- alibi,
- causal_mask,
- head_mask[i], # type: ignore
- )
- else:
- outputs = block(
- hidden_states,
- layer_past=layer_past,
- attention_mask=causal_mask,
- head_mask=head_mask[i], # type: ignore
- use_cache=use_cache,
- output_attentions=output_attentions,
- alibi=alibi,
- )
-
- hidden_states = outputs[0]
- if use_cache is True:
- presents = presents + (outputs[1],) # type: ignore
-
- if output_attentions:
- oa = (outputs[2 if use_cache else 1],) # type: ignore
- all_self_attentions = all_self_attentions + oa # type: ignore
-
- # Add last hidden state
- hidden_states = self.ln_f(hidden_states)
-
- if output_hidden_states:
- hst = (hidden_states,)
- all_hidden_states = all_hidden_states + hst # type: ignore
-
- if not return_dict:
- return tuple(v for v in [
- hidden_states, presents, all_hidden_states, all_self_attentions
- ] if v is not None)
-
- return BaseModelOutputWithPastAndCrossAttentions(
- last_hidden_state=hidden_states,
- past_key_values=presents,
- hidden_states=all_hidden_states,
- attentions=all_self_attentions,
- )
-
- # Make it so model.transformer has the new helper methods and new
- # `forward` method
- setattr(model.transformer, '_prepare_attn_mask',
- MethodType(_prepare_attn_mask, model.transformer))
- setattr(model.transformer, '_build_alibi_tensor',
- MethodType(_build_alibi_tensor, model.transformer))
- setattr(model.transformer, 'forward',
- MethodType(transformer_forward, model.transformer))
-
- # In order to actually use the new argument we've added to
- # model.transformer, we need to update the parent module's `forward` to
- # accept/pass the same new argument.
- # We add 2 lines to handle that change.
- # Both lines are tagged with "# WE'RE ADDING A NEW ARGUMENT!"
- KeyValueT = Tuple[torch.Tensor, torch.Tensor]
-
- def forward(
- self: BloomForCausalLM,
- input_ids: Optional[torch.LongTensor] = None,
- past_key_values: Optional[Tuple[KeyValueT, ...]] = None,
- attention_mask: Optional[torch.Tensor] = None,
- # WE'RE ADDING A NEW ARGUMENT! (Change 1/2)
- bidirectional_mask: Optional[torch.Tensor] = None,
- head_mask: Optional[torch.Tensor] = None,
- inputs_embeds: Optional[torch.Tensor] = None,
- labels: Optional[torch.Tensor] = None,
- use_cache: Optional[bool] = None,
- output_attentions: Optional[bool] = None,
- output_hidden_states: Optional[bool] = None,
- return_dict: Optional[bool] = None,
- **deprecated_arguments: Any,
- ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
- """Replacement forward method for BloomCausalLM."""
- if deprecated_arguments.pop('position_ids', False) is not False:
- # `position_ids` could have been `torch.Tensor` or `None` so
- # defaulting pop to `False` allows to detect if users were passing
- # explicitly `None`
- warnings.warn(
- '`position_ids` have no functionality in BLOOM and will be removed ' +\
- 'in v5.0.0. You can safely ignore passing `position_ids`.',
- FutureWarning,
- )
- if len(deprecated_arguments) > 0:
- raise ValueError(
- f'Got unexpected arguments: {deprecated_arguments}')
-
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
- transformer_outputs = self.transformer(
- input_ids,
- past_key_values=past_key_values,
- attention_mask=attention_mask,
- # WE'RE ADDING A NEW ARGUMENT! (Change 2/2)
- bidirectional_mask=bidirectional_mask,
- head_mask=head_mask,
- inputs_embeds=inputs_embeds,
- use_cache=use_cache,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- )
- hidden_states = transformer_outputs[0]
-
- lm_logits = self.lm_head(hidden_states)
-
- loss = None
- if labels is not None:
- # Shift so that tokens < n predict n
- shift_logits = lm_logits[..., :-1, :].contiguous()
- shift_labels = labels[..., 1:].contiguous()
- batch_size, seq_length, vocab_size = shift_logits.shape
- # Flatten the tokens
- loss_fct = CrossEntropyLoss()
- loss = loss_fct(
- shift_logits.view(batch_size * seq_length, vocab_size),
- shift_labels.view(batch_size * seq_length))
-
- if not return_dict:
- output = (lm_logits,) + transformer_outputs[1:]
- return ((loss,) + output) if loss is not None else output
-
- return CausalLMOutputWithCrossAttentions(
- loss=loss,
- logits=lm_logits,
- past_key_values=transformer_outputs.past_key_values,
- hidden_states=transformer_outputs.hidden_states,
- attentions=transformer_outputs.attentions,
- )
-
- # To handle generation, re-write `prepare_inputs_for_generation` to
- # implement the bidirectional logic.
- def prepare_inputs_for_generation(self: BloomForCausalLM,
- input_ids: torch.LongTensor,
- past: Optional[torch.Tensor] = None,
- attention_mask: Optional[
- torch.Tensor] = None,
- **kwargs: Any) -> dict:
- del kwargs # unused
- # only last token for input_ids if past is not None
- if past:
- input_ids = input_ids[:, -1].unsqueeze(-1) # type: ignore
- # We can turn off bidirectional masking after the prefix
- # has been encoded into `past`
- bidirectional_mask = None
-
- # the cache may be in the standard format (e.g. in contrastive
- # search), convert to bloom's format if needed
- if past[0][0].shape[0] == input_ids.shape[0]:
- past = self._convert_to_bloom_cache(past)
-
- else:
- # If we're here, `input_ids` contains the prefix. Encode it with
- # bidirectional attention.
- bidirectional_mask = torch.ones_like(input_ids)
-
- return {
- 'input_ids': input_ids,
- 'past_key_values': past,
- # "use_cache": kwargs.get("use_cache"),
- # Requires this. TODO(Alex): Confirm this supports other decoding strategies.
- 'use_cache': True,
- 'attention_mask': attention_mask,
- 'bidirectional_mask': bidirectional_mask,
- }
-
- # Register the new `forward` and `prepare_inputs_for_generation` methods
- # with the model
- setattr(model, 'forward', MethodType(forward, model))
- setattr(model, 'prepare_inputs_for_generation',
- MethodType(prepare_inputs_for_generation, model))
-
- # Finally, tag the model so that this conversion cannot happen again.
- setattr(model, '_prefix_lm_converted', True)
- return model
-
-
-def _convert_opt_causal_lm_to_prefix_lm(
- model: OPTForCausalLM) -> OPTForCausalLM:
- """Converts an OPT Causal LM to a Prefix LM.
-
- Supported HuggingFace model classes:
- - `OPTForCausalLM`
-
- See `convert_hf_causal_lm_to_prefix_lm` for more details.
- """
- if hasattr(model, '_prefix_lm_converted'):
- return model
-
- assert isinstance(model, OPTForCausalLM)
- assert model.config.add_cross_attention == False, 'Only supports OPT decoder-only models'
-
- # Rename methods to allow:
- # - new `forward` to wrap original `forward`
- # - new `generate` to wrap original `generate`
- setattr(model, '_original_forward', getattr(model, 'forward'))
- setattr(model, '_original_generate', getattr(model, 'generate'))
-
- model.model.decoder.bidirectional_mask = None
-
- # Modified from transformers.models.bloom.modeling_opt.OPTDecoder._prepare_decoder_attn_mask
- # https://github.com/huggingface/transformers/blob/v4.25.1/src/transformers/models/opt/modeling_opt.py#L532
- def _prepare_decoder_attention_mask(self: torch.nn.Module,
- attention_mask: Optional[torch.Tensor],
- input_shape: Tuple[int, int],
- inputs_embeds: Optional[torch.Tensor],
- past_key_values_length: int):
- # create causal mask
- # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
- combined_attention_mask = None
- if input_shape[-1] > 1:
- assert inputs_embeds is not None
- # 'g' indicates generation mode. Causal mask replaced with 0.
- if self.bidirectional_mask == 'g':
- bsz, src_length = input_shape
- combined_attention_mask = torch.zeros(
- (bsz, 1, src_length, src_length + past_key_values_length),
- dtype=inputs_embeds.dtype,
- device=inputs_embeds.device)
- else:
- combined_attention_mask = _make_causal_mask_opt(
- input_shape,
- inputs_embeds.dtype,
- past_key_values_length=past_key_values_length).to(
- inputs_embeds.device)
-
- # Make use of the batch-specific `bidirectional_mask` attribute
- # set by the parent module in its (new) `forward` method wrapper
- if self.bidirectional_mask is not None:
- assert attention_mask is not None
- # The two masks should have the same size
- assert attention_mask.shape == self.bidirectional_mask.shape
-
- # [batch_size, seq_length] -> [batch_size, 1, tgt_length, src_length]
- expanded_bidirectional_mask = _expand_mask_opt(
- self.bidirectional_mask,
- inputs_embeds.dtype,
- tgt_len=input_shape[-1]).to(inputs_embeds.device)
- combined_attention_mask = torch.maximum(
- expanded_bidirectional_mask, combined_attention_mask)
-
- if attention_mask is not None:
- assert inputs_embeds is not None
- # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
- expanded_attn_mask = _expand_mask_opt(attention_mask,
- inputs_embeds.dtype,
- tgt_len=input_shape[-1]).to(
- inputs_embeds.device)
- combined_attention_mask = (expanded_attn_mask
- if combined_attention_mask is None else
- expanded_attn_mask +
- combined_attention_mask)
-
- return combined_attention_mask
-
- # Make it so model.model.decoder uses the above `_prepare_decoder_attn_mask`
- # in place of the original method
- setattr(model.model.decoder, '_prepare_decoder_attention_mask',
- MethodType(_prepare_decoder_attention_mask, model.model.decoder))
-
- def forward(
- self: OPTForCausalLM,
- input_ids: Optional[torch.LongTensor] = None,
- attention_mask: Optional[torch.Tensor] = None,
- bidirectional_mask: Optional[torch.ByteTensor] = None,
- head_mask: Optional[torch.Tensor] = None,
- past_key_values: Optional[List[torch.FloatTensor]] = None,
- inputs_embeds: Optional[torch.FloatTensor] = None,
- labels: Optional[torch.LongTensor] = None,
- use_cache: Optional[bool] = None,
- output_attentions: Optional[bool] = None,
- output_hidden_states: Optional[bool] = None,
- return_dict: Optional[bool] = None,
- ):
-
- def call_og_forward():
- return self._original_forward(
- input_ids=input_ids,
- attention_mask=attention_mask,
- head_mask=head_mask,
- past_key_values=past_key_values,
- inputs_embeds=inputs_embeds,
- labels=labels,
- use_cache=use_cache,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- )
-
- if bidirectional_mask is None:
- # This wrapper is a no-op if bidirectional masks are not supplied
- return call_og_forward()
-
- # Temporarily set `bidirectional_mask` in the child module
- self.model.decoder.bidirectional_mask = bidirectional_mask
-
- # Apply the original forward method (the model will use the mask that
- # was just set)
- try:
- outputs = call_og_forward()
- except:
- self.model.decoder.bidirectional_mask = None
- raise
-
- # Reset the `bidirectional_mask` attribute to None
- self.model.decoder.bidirectional_mask = None
-
- # Return the outputs
- return outputs
-
- def generate(self: OPTForCausalLM, *args: tuple, **kwargs: Any):
- """Wraps original generate to enable PrefixLM-style attention."""
- # Flag the child module to use generation-style attention masking
- self.model.decoder.bidirectional_mask = 'g'
-
- # Collect outputs using the model's original forward method
- try:
- output = self._original_generate(*args, **kwargs)
- except:
- self.model.decoder.bidirectional_mask = None
- raise
-
- # Reset the `bidirectional_mask` attribute to None
- self.model.decoder.bidirectional_mask = None
-
- # Return the output
- return output
-
- # Replace `forward` and `generate` with the new wrappers
- setattr(model, 'forward', MethodType(forward, model))
- setattr(model, 'generate', MethodType(generate, model))
-
- # Finally, tag the model so that this conversion cannot happen again.
- setattr(model, '_prefix_lm_converted', True)
- return model
-
-
-_SUPPORTED_HF_MODELS = _SUPPORTED_GPT_MODELS + (BloomForCausalLM,
- OPTForCausalLM)
+_SUPPORTED_HF_MODELS = _SUPPORTED_GPT_MODELS
CAUSAL_LM_TYPES = Union[GPT2LMHeadModel, GPTJForCausalLM, GPTNeoForCausalLM,
- GPTNeoXForCausalLM, BloomForCausalLM, OPTForCausalLM]
+ GPTNeoXForCausalLM]
def convert_hf_causal_lm_to_prefix_lm(
@@ -811,8 +221,6 @@ def convert_hf_causal_lm_to_prefix_lm(
- `GPTNeoForCausalLM`
- `GPTNeoXForCausalLM`
- `GPTJForCausalLM`
- - `BloomForCausalLM`
- - `OPTForCausalLM`
Conversion to a Prefix LM is done by modifying the `forward` method, and possibly also the
`generate` method and/or select underlying methods depending on the model class.
@@ -862,13 +270,6 @@ def convert_hf_causal_lm_to_prefix_lm(
"""
if isinstance(model, _SUPPORTED_GPT_MODELS):
return _convert_gpt_causal_lm_to_prefix_lm(model)
-
- elif isinstance(model, BloomForCausalLM):
- return _convert_bloom_causal_lm_to_prefix_lm(model)
-
- elif isinstance(model, OPTForCausalLM):
- return _convert_opt_causal_lm_to_prefix_lm(model)
-
else:
raise TypeError(
f'Cannot convert model to Prefix LM. ' +\
diff --git a/llmfoundry/tokenizers/tiktoken.py b/llmfoundry/tokenizers/tiktoken.py
index 41518a582a..650d469ecf 100644
--- a/llmfoundry/tokenizers/tiktoken.py
+++ b/llmfoundry/tokenizers/tiktoken.py
@@ -1,6 +1,7 @@
# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0
+import warnings
from typing import Any, Dict, List, Optional, Tuple, Union
import torch
@@ -26,7 +27,7 @@ def __init__(self,
eos_token: Optional[str] = '<|endoftext|>',
bos_token: Optional[str] = '<|endoftext|>',
pad_token: Optional[str] = None,
- **kwargs: Dict[str, Any]):
+ **kwargs: Any):
"""Constructor creates a tiktoken tokenizer to use as the underlying.
tokenizer.
@@ -49,6 +50,23 @@ def __init__(self,
raise ImportError(
'You need to install tiktoken to use TiktokenTokenizerWrapper.')
+ # Workaround to make tiktokenizer picklable.
+ # https://github.com/huggingface/datasets/issues/5536#issuecomment-1682309347
+ # There is an open PR from HF to add this to tiktoken: https://github.com/openai/tiktoken/pull/181
+ import copyreg
+ import functools
+
+ from tiktoken import Encoding # type: ignore (thirdParty)
+
+ def pickle_Encoding(enc: Encoding):
+ return (functools.partial(Encoding,
+ enc.name,
+ pat_str=enc._pat_str,
+ mergeable_ranks=enc._mergeable_ranks,
+ special_tokens=enc._special_tokens), ())
+
+ copyreg.pickle(Encoding, pickle_Encoding)
+
if model_name is not None and encoding_name is not None:
raise ValueError(
'You need to specify either model_name or encoding_name, not both.'
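Why the copyreg workaround above matters in practice: once the reducer is registered, the wrapper (and the tiktoken Encoding it holds) survives pickling, which libraries such as `datasets` rely on when spawning worker processes. A hedged sanity check, assuming the wrapper is built from a tiktoken encoding name and that the package exposes it as shown in this PR (the encoding name below is only an example):

    import pickle

    from llmfoundry.tokenizers import TiktokenTokenizerWrapper

    tok = TiktokenTokenizerWrapper(encoding_name='gpt2')
    restored = pickle.loads(pickle.dumps(tok))  # works because Encoding now has a reducer
    assert restored('hello world')['input_ids'] == tok('hello world')['input_ids']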
@@ -90,7 +108,17 @@ def is_fast(self) -> bool:
return False
def get_vocab(self) -> Dict[str, int]:
- """Returns vocab as a dict."""
+ """Returns vocab as a dict.
+
+        Note: This function does not work properly due to a difference in assumptions between tiktoken and Hugging Face tokenizers.
+ Most uses do not need to use get_vocab, so this is not a priority to fix.
+ """
+ warnings.warn(
+ 'get_vocab does not work properly with TiktokenTokenizerWrapper. Please do not rely on it being perfectly correct.'
+ +
+            ' It will be called once during init just to get the size of the vocab inside the base class.'
+ )
+
vocab = {}
for i in range(self.vocab_size):
try:
@@ -101,6 +129,24 @@ def get_vocab(self) -> Dict[str, int]:
except KeyError:
pass
+        # As far as I can tell, we don't require get_vocab to completely work,
+        # but when using additional_special_tokens, Hugging Face determines the next
+        # token index to add with len(self.get_vocab()), so we need the _size_ of this dictionary to be correct.
+        extra_id_index = 0
+        candidate_extra_id = f'<extra_id_{extra_id_index}>'
+ indices_to_fill_in = {i for i in range(self.vocab_size)} - set(
+ vocab.values())
+
+ # Add enough indices to make get_vocab() the right length
+ for index_to_add in indices_to_fill_in:
+ # Make sure we don't overwrite a token that already exists
+ while candidate_extra_id in vocab:
+ extra_id_index += 1
+                candidate_extra_id = f'<extra_id_{extra_id_index}>'
+
+ # Get an index to add and add the item
+ vocab[candidate_extra_id] = index_to_add
+
return vocab
def _tokenize(self, text: str) -> List[int]:
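To make the intent concrete: only the size of the returned dictionary matters, because Hugging Face assigns the index of a newly added special token as `len(self.get_vocab())`. A minimal check under that assumption (the encoding name is just an example):

    from llmfoundry.tokenizers import TiktokenTokenizerWrapper

    tok = TiktokenTokenizerWrapper(encoding_name='gpt2')
    vocab = tok.get_vocab()
    # the <extra_id_*> filler entries pad the dict out to exactly vocab_size entries,
    # which is all that the base class arithmetic relies on
    assert len(vocab) == tok.vocab_size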
@@ -155,7 +201,7 @@ def convert_ids_to_tokens(
"""
if isinstance(ids, int):
if ids in self.added_tokens_decoder:
- return self.added_tokens_decoder[ids]
+ return str(self.added_tokens_decoder[ids])
return self._convert_id_to_token(ids)
@@ -171,7 +217,7 @@ def convert_ids_to_tokens(
if index in self.added_tokens_decoder:
tokens.append(self.encoding.decode(current_stream))
current_stream = []
- tokens.append(self.added_tokens_decoder[index])
+ tokens.append(str(self.added_tokens_decoder[index]))
else:
current_stream.append(index)
diff --git a/llmfoundry/utils/__init__.py b/llmfoundry/utils/__init__.py
index 38cc562c9d..7abe4dcf75 100644
--- a/llmfoundry/utils/__init__.py
+++ b/llmfoundry/utils/__init__.py
@@ -11,6 +11,8 @@
from llmfoundry.utils.config_utils import (calculate_batch_size_info,
log_config, pop_config,
update_batch_size_info)
+ from llmfoundry.utils.model_download_utils import (
+ download_from_cache_server, download_from_hf_hub)
except ImportError as e:
raise ImportError(
'Please make sure to pip install . to get requirements for llm-foundry.'
@@ -26,6 +28,8 @@
'build_tokenizer',
'calculate_batch_size_info',
'convert_and_save_ft_weights',
+ 'download_from_cache_server',
+ 'download_from_hf_hub',
'get_hf_tokenizer_from_composer_state_dict',
'update_batch_size_info',
'log_config',
diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py
index f027afb0ce..dedf6f5434 100644
--- a/llmfoundry/utils/builders.py
+++ b/llmfoundry/utils/builders.py
@@ -73,7 +73,8 @@ def build_icl_data_and_gauntlet(
return icl_evaluators, logger_keys, eval_gauntlet_cb
-def build_callback(name: str, kwargs: Dict[str, Any]) -> Callback:
+def build_callback(name: str, kwargs: Union[DictConfig, Dict[str,
+ Any]]) -> Callback:
if name == 'lr_monitor':
return LRMonitor()
elif name == 'memory_monitor':
@@ -117,6 +118,8 @@ def build_callback(name: str, kwargs: Dict[str, Any]) -> Callback:
elif name == 'early_stopper':
return EarlyStopper(**kwargs)
elif name == 'hf_checkpointer':
+ if isinstance(kwargs, DictConfig):
+ kwargs = om.to_object(kwargs) # pyright: ignore
return HuggingFaceCheckpointer(**kwargs)
else:
raise ValueError(f'Not sure how to build callback: {name}')
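The `om.to_object` conversion above matters because `HuggingFaceCheckpointer` expects plain Python values rather than OmegaConf nodes once the kwargs are unpacked. A small sketch of that conversion; the keys and values here are illustrative, not a required config:

    from omegaconf import DictConfig
    from omegaconf import OmegaConf as om

    kwargs = DictConfig({'save_folder': 's3://my-bucket/checkpoints', 'save_interval': '1ba'})
    if isinstance(kwargs, DictConfig):
        kwargs = om.to_object(kwargs)  # now a plain dict, safe to unpack with **
    print(type(kwargs))  # <class 'dict'>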
@@ -188,6 +191,14 @@ def build_tokenizer(
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = '1'
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
+ signal_file_path = f'.node_{dist.get_node_rank()}_local_rank0_completed_tokenizer_setup'
+
+ if dist.is_available() and dist.is_initialized(
+ ) and dist.get_world_size() > 1:
+ # Make sure the tokenizer files are downloaded and cached first by local rank 0
+ with dist.local_rank_zero_download_and_wait(signal_file_path):
+ pass
+
if tokenizer_name.startswith('tiktoken'):
tokenizer = TiktokenTokenizerWrapper(**tokenizer_kwargs)
else:
@@ -202,6 +213,17 @@ def build_tokenizer(
int(1e30),
)
+ if dist.is_available() and dist.is_initialized(
+ ) and dist.get_world_size() > 1:
+ if dist.get_local_rank() == 0:
+ with open(signal_file_path, 'wb') as f:
+ f.write(b'local_rank0_completed_tokenizer_setup')
+
+ dist.barrier()
+
+ if dist.get_local_rank() == 0:
+ os.remove(signal_file_path)
+
return tokenizer
diff --git a/llmfoundry/utils/checkpoint_conversion_helpers.py b/llmfoundry/utils/checkpoint_conversion_helpers.py
index 0627cec4cd..35e77eab6c 100644
--- a/llmfoundry/utils/checkpoint_conversion_helpers.py
+++ b/llmfoundry/utils/checkpoint_conversion_helpers.py
@@ -19,7 +19,8 @@
import numpy as np
import sentencepiece as spm
-from transformers import AutoTokenizer, PreTrainedTokenizer
+from transformers import (AutoTokenizer, PreTrainedTokenizer,
+ PreTrainedTokenizerFast)
log = logging.getLogger(__name__)
@@ -35,8 +36,9 @@ def _get_weight_data_type(data_type: str):
# TODO: move this functionality to composer once the bug fixes are upstreamed
def get_hf_tokenizer_from_composer_state_dict(
- state_dict: Dict[str, Any],
- tokenizer_save_dir: Optional[str] = None
+ state_dict: Dict[str, Any],
+ trust_remote_code: bool,
+ tokenizer_save_dir: Optional[str] = None,
) -> Optional[PreTrainedTokenizer]:
if 'state' not in state_dict:
raise RuntimeError(
@@ -85,7 +87,8 @@ def get_hf_tokenizer_from_composer_state_dict(
with open(tokenizer_file_path, 'wb') as _tmp_file:
_tmp_file.write(s.serialized_model_proto())
- hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_dir)
+ hf_tokenizer = load_tokenizer(tokenizer_save_dir,
+ trust_remote_code=trust_remote_code)
# remove 'name_or_path'
hf_tokenizer.name_or_path = ''
@@ -94,6 +97,20 @@ def get_hf_tokenizer_from_composer_state_dict(
return hf_tokenizer
+def load_tokenizer(
+ tokenizer_save_dir: str, trust_remote_code: bool
+) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+ try:
+ return AutoTokenizer.from_pretrained(
+ tokenizer_save_dir, trust_remote_code=trust_remote_code)
+ except ValueError as e:
+ raise ValueError(
+ f'Got error while loading tokenizer with trust_remote_code={trust_remote_code}: {e}. '
+ +
+ 'If accessing a tokenizer defined outside of the transformers module,'
+ + ' please use --trust_remote_code.')
+
+
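A hedged usage sketch for the new `load_tokenizer` helper; the directory path is hypothetical:

    # Tokenizers that ship custom code require trust_remote_code=True; otherwise the
    # helper re-raises the ValueError with a pointer to the --trust_remote_code flag.
    hf_tokenizer = load_tokenizer('/tmp/hf_tokenizer_save_dir', trust_remote_code=True)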
def _write_zero_bias(weight_name: str, weight_file_path: str,
bias_shape: Union[Tuple[int, ...], int]) -> None:
"""Write zeros for bias when converting MPT to FasterTransformer weights.
diff --git a/llmfoundry/utils/model_download_utils.py b/llmfoundry/utils/model_download_utils.py
new file mode 100644
index 0000000000..2104455e0f
--- /dev/null
+++ b/llmfoundry/utils/model_download_utils.py
@@ -0,0 +1,235 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
+"""Utility functions for downloading models."""
+import copy
+import logging
+import os
+import time
+import warnings
+from http import HTTPStatus
+from typing import Optional
+from urllib.parse import urljoin
+
+import huggingface_hub as hf_hub
+import requests
+import tenacity
+from bs4 import BeautifulSoup
+from requests.packages.urllib3.exceptions import InsecureRequestWarning
+from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME
+from transformers.utils import WEIGHTS_INDEX_NAME as PYTORCH_WEIGHTS_INDEX_NAME
+from transformers.utils import WEIGHTS_NAME as PYTORCH_WEIGHTS_NAME
+
+DEFAULT_IGNORE_PATTERNS = [
+ '*.ckpt',
+ '*.h5',
+ '*.msgpack',
+]
+PYTORCH_WEIGHTS_PATTERN = 'pytorch_model*.bin*'
+SAFE_WEIGHTS_PATTERN = 'model*.safetensors*'
+
+log = logging.getLogger(__name__)
+
+
+@tenacity.retry(retry=tenacity.retry_if_not_exception_type(
+ (ValueError, hf_hub.utils.RepositoryNotFoundError)),
+ stop=tenacity.stop_after_attempt(3),
+ wait=tenacity.wait_exponential(min=1, max=10))
+def download_from_hf_hub(
+ repo_id: str,
+ save_dir: Optional[str] = None,
+ prefer_safetensors: bool = True,
+ token: Optional[str] = None,
+):
+ """Downloads model files from a Hugging Face Hub model repo.
+
+ Only supports models stored in Safetensors and PyTorch formats for now. If both formats are available, only the
+ Safetensors weights will be downloaded unless `prefer_safetensors` is set to False.
+
+ Args:
+ repo_id (str): The Hugging Face Hub repo ID.
+ save_dir (str, optional): The path to the directory where the model files will be downloaded. If `None`, reads
+ from the `HUGGINGFACE_HUB_CACHE` environment variable or uses the default Hugging Face Hub cache directory.
+ prefer_safetensors (bool): Whether to prefer Safetensors weights over PyTorch weights if both are
+ available. Defaults to True.
+ token (str, optional): The HuggingFace API token. If not provided, the token will be read from the
+ `HUGGING_FACE_HUB_TOKEN` environment variable.
+
+ Raises:
+ RepositoryNotFoundError: If the model repo doesn't exist or the token is unauthorized.
+ ValueError: If the model repo doesn't contain any supported model weights.
+ """
+ repo_files = set(hf_hub.list_repo_files(repo_id))
+
+ # Ignore TensorFlow, TensorFlow 2, and Flax weights as they are not supported by Composer.
+ ignore_patterns = copy.deepcopy(DEFAULT_IGNORE_PATTERNS)
+
+ safetensors_available = (SAFE_WEIGHTS_NAME in repo_files or
+ SAFE_WEIGHTS_INDEX_NAME in repo_files)
+ pytorch_available = (PYTORCH_WEIGHTS_NAME in repo_files or
+ PYTORCH_WEIGHTS_INDEX_NAME in repo_files)
+
+ if safetensors_available and pytorch_available:
+ if prefer_safetensors:
+ log.info(
+ 'Safetensors available and preferred. Excluding pytorch weights.'
+ )
+ ignore_patterns.append(PYTORCH_WEIGHTS_PATTERN)
+ else:
+ log.info(
+ 'Pytorch available and preferred. Excluding safetensors weights.'
+ )
+ ignore_patterns.append(SAFE_WEIGHTS_PATTERN)
+ elif safetensors_available:
+ log.info('Only safetensors available. Ignoring weights preference.')
+ elif pytorch_available:
+ log.info('Only pytorch available. Ignoring weights preference.')
+ else:
+ raise ValueError(
+ f'No supported model weights found in repo {repo_id}.' +
+ ' Please make sure the repo contains either safetensors or pytorch weights.'
+ )
+
+ download_start = time.time()
+ hf_hub.snapshot_download(repo_id,
+ cache_dir=save_dir,
+ ignore_patterns=ignore_patterns,
+ token=token)
+ download_duration = time.time() - download_start
+ log.info(
+ f'Downloaded model {repo_id} from Hugging Face Hub in {download_duration} seconds'
+ )
+
+
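A possible invocation of `download_from_hf_hub`, assuming a public repo and an explicit cache directory; the repo ID and paths are examples only:

    import os

    from llmfoundry.utils.model_download_utils import download_from_hf_hub

    download_from_hf_hub(
        'mosaicml/mpt-7b',  # any Hugging Face Hub repo ID
        save_dir='/tmp/hf_cache',  # or None to fall back to the default HF cache
        prefer_safetensors=True,
        token=os.getenv('HUGGING_FACE_HUB_TOKEN'),
    )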
+def _extract_links_from_html(html: str):
+ """Extracts links from HTML content.
+
+ Args:
+ html (str): The HTML content
+
+ Returns:
+ list[str]: A list of links to download.
+ """
+ soup = BeautifulSoup(html, 'html.parser')
+ links = [a['href'] for a in soup.find_all('a')]
+ return links
+
+
+def _recursive_download(
+ session: requests.Session,
+ base_url: str,
+ path: str,
+ save_dir: str,
+ ignore_cert: bool = False,
+):
+ """Downloads all files/subdirectories from a directory on a remote server.
+
+ Args:
+ session: A requests.Session through which to make requests to the remote server.
+        base_url (str): The base URL where the files are located.
+        path (str): The path from the base URL to the files to download. The full URL for the download is equal to
+            '<base_url>/<path>'.
+ save_dir (str): The directory to save downloaded files to.
+ ignore_cert (bool): Whether or not to ignore the validity of the SSL certificate of the remote server.
+ Defaults to False.
+ WARNING: Setting this to true is *not* secure, as no certificate verification will be performed.
+
+ Raises:
+ PermissionError: If the remote server returns a 401 Unauthorized status code.
+ ValueError: If the remote server returns a 404 Not Found status code.
+ RuntimeError: If the remote server returns a status code other than 200 OK or 401 Unauthorized.
+ """
+ url = urljoin(base_url, path)
+ response = session.get(url, verify=(not ignore_cert))
+
+ if response.status_code == HTTPStatus.UNAUTHORIZED:
+ raise PermissionError(
+ f'Not authorized to download file from {url}. Received status code {response.status_code}. '
+ )
+ elif response.status_code == HTTPStatus.NOT_FOUND:
+ raise ValueError(
+ f'Could not find file at {url}. Received status code {response.status_code}'
+ )
+ elif response.status_code != HTTPStatus.OK:
+ raise RuntimeError(
+ f'Could not download file from {url}. Received unexpected status code {response.status_code}'
+ )
+
+ # Assume that the URL points to a file if it does not end with a slash.
+ if not path.endswith('/'):
+ save_path = os.path.join(save_dir, path)
+ parent_dir = os.path.dirname(save_path)
+ if not os.path.exists(parent_dir):
+ os.makedirs(parent_dir)
+
+ with open(save_path, 'wb') as f:
+ f.write(response.content)
+
+ log.info(f'Downloaded file {save_path}')
+ return
+
+ # If the URL is a directory, the response should be an HTML directory listing that we can parse for additional links
+ # to download.
+ child_links = _extract_links_from_html(response.content.decode())
+ for child_link in child_links:
+ _recursive_download(session,
+ base_url,
+ urljoin(path, child_link),
+ save_dir,
+ ignore_cert=ignore_cert)
+
+
+@tenacity.retry(retry=tenacity.retry_if_not_exception_type(
+ (PermissionError, ValueError)),
+ stop=tenacity.stop_after_attempt(3),
+ wait=tenacity.wait_exponential(min=1, max=10))
+def download_from_cache_server(
+ model_name: str,
+ cache_base_url: str,
+ save_dir: str,
+ token: Optional[str] = None,
+ ignore_cert: bool = False,
+):
+ """Downloads Hugging Face models from a mirror file server.
+
+ The file server is expected to store the files in the same structure as the Hugging Face cache
+ structure. See https://huggingface.co/docs/huggingface_hub/guides/manage-cache.
+
+ Args:
+ model_name: The name of the model to download. This should be the same as the repository ID in the Hugging Face
+ Hub.
+ cache_base_url: The base URL of the cache file server. This function will attempt to download all of the blob
+            files from `<cache_base_url>/<formatted_model_name>/blobs/`, where `formatted_model_name` is equal to
+            `models/<model_name>` with all slashes replaced with `--`.
+ save_dir: The directory to save the downloaded files to.
+ token: The Hugging Face API token. If not provided, the token will be read from the `HUGGING_FACE_HUB_TOKEN`
+ environment variable.
+ ignore_cert: Whether or not to ignore the validity of the SSL certificate of the remote server. Defaults to
+ False.
+ WARNING: Setting this to true is *not* secure, as no certificate verification will be performed.
+ """
+ formatted_model_name = f'models/{model_name}'.replace('/', '--')
+ with requests.Session() as session:
+ session.headers.update({'Authorization': f'Bearer {token}'})
+
+ download_start = time.time()
+
+ # Temporarily suppress noisy SSL certificate verification warnings if ignore_cert is set to True
+ with warnings.catch_warnings():
+ if ignore_cert:
+ warnings.simplefilter('ignore', category=InsecureRequestWarning)
+
+ # Only downloads the blobs in order to avoid downloading model files twice due to the
+            # symlinks in the Hugging Face cache structure:
+ _recursive_download(
+ session,
+ cache_base_url,
+ # Trailing slash to indicate directory
+ f'{formatted_model_name}/blobs/',
+ save_dir,
+ ignore_cert=ignore_cert,
+ )
+ download_duration = time.time() - download_start
+ log.info(
+ f'Downloaded model {model_name} from cache server in {download_duration} seconds'
+ )
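For the mirror layout described in the docstring, a hypothetical call looks like the following; the cache URL is illustrative and must point at a server that mirrors the Hugging Face cache structure:

    import os

    from llmfoundry.utils.model_download_utils import download_from_cache_server

    # Fetches <cache_base_url>/models--mosaicml--mpt-7b/blobs/ into save_dir.
    download_from_cache_server(
        'mosaicml/mpt-7b',
        cache_base_url='https://hf-mirror.internal.example.com/',
        save_dir='/tmp/hf_cache',
        token=os.getenv('HUGGING_FACE_HUB_TOKEN'),
    )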
diff --git a/mcli/mcli-llama2-finetune.yaml b/mcli/mcli-llama2-finetune.yaml
index ae8f57abb6..93d46f57e3 100644
--- a/mcli/mcli-llama2-finetune.yaml
+++ b/mcli/mcli-llama2-finetune.yaml
@@ -56,7 +56,10 @@ parameters:
allow_pad_trimming: false
decoder_only_format: true
shuffle: true
- # # Use `python llmfoundry/data/packing.py --yaml-path /path/to/this/yaml/ ...`
+ # # Use packing_ratio: 'auto' to automatically profile and select the highest observed packing ratio with
+ # # zero waste. In practice, this may result in > 0 waste because profiling is done on only a portion
+ # # of the dataset.
+  # # Or use `python scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...`
# # to profile this run's optimal packing_ratio as it depends on GPU count,
# # batch size, sequence length
# packing_ratio:
diff --git a/pyproject.toml b/pyproject.toml
index a2fcec3eed..0b078120b3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -86,13 +86,6 @@ filterwarnings = [
'ignore::DeprecationWarning:tensorboard', # ignore tensorboard
]
-# Enable logging for pytest
-log_cli = true
-log_cli_level = "INFO"
-log_cli_format = "%(asctime)s [%(levelname)8s] %(message)s (%(filename)s:%(lineno)s)"
-log_cli_date_format = "%Y-%m-%d %H:%M:%S"
-
-
# Yapf
[tool.yapf]
# Align closing bracket with visual indentation.
diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py
index f07942ba10..02a5d1f862 100644
--- a/scripts/eval/eval.py
+++ b/scripts/eval/eval.py
@@ -145,7 +145,8 @@ def evaluate_model(
if eval_gauntlet_df is None and eval_gauntlet_callback is not None:
eval_gauntlet_df = pd.DataFrame(
- columns=['model_name', 'average'] +
+ columns=['model_name'] +
+ [avg for avg in eval_gauntlet_callback.averages] +
[t.name for t in eval_gauntlet_callback.categories])
load_path = model_cfg.get('load_path', None)
@@ -314,23 +315,17 @@ def main(cfg: DictConfig):
if eval_gauntlet_df is not None and eval_gauntlet_callback is not None:
assert composite_scores is not None
row = {'model_name': model_cfg['model_name']}
- row.update({
- t.name:
- composite_scores.get(f'icl/metrics/eval_gauntlet/{t.name}',
- None)
- for t in eval_gauntlet_callback.categories
- })
- row.update({
- 'average':
- composite_scores[f'icl/metrics/eval_gauntlet/average']
- })
+ row.update(
+ {k.split('/')[-1]: v for k, v in composite_scores.items()})
eval_gauntlet_df = pd.concat(
[eval_gauntlet_df, pd.DataFrame([row])], ignore_index=True)
print(f'Printing gauntlet results for all models')
+
print(
eval_gauntlet_df.sort_values(
- 'average', ascending=False).to_markdown(index=False))
+ list(eval_gauntlet_callback.averages.keys())[0],
+ ascending=False).to_markdown(index=False))
print(f'Printing complete results for all models')
assert models_df is not None
print(models_df.to_markdown(index=False))
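To make the new row-building logic concrete, here is how the gauntlet metric keys collapse to column names; the scores are made up:

    composite_scores = {
        'icl/metrics/eval_gauntlet/core_average': 0.42,
        'icl/metrics/eval_gauntlet/world_knowledge': 0.37,
        'icl/metrics/eval_gauntlet/language_understanding': 0.35,
    }
    row = {'model_name': 'my-model'}
    row.update({k.split('/')[-1]: v for k, v in composite_scores.items()})
    # row == {'model_name': 'my-model', 'core_average': 0.42,
    #         'world_knowledge': 0.37, 'language_understanding': 0.35}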
diff --git a/scripts/eval/yamls/eval_gauntlet.yaml b/scripts/eval/yamls/eval_gauntlet.yaml
index 87e01fd44c..791023abcf 100644
--- a/scripts/eval/yamls/eval_gauntlet.yaml
+++ b/scripts/eval/yamls/eval_gauntlet.yaml
@@ -2,6 +2,27 @@ eval_gauntlet:
weighting: EQUAL
subtract_random_baseline: true
rescale_accuracy: true
+ averages:
+ core_average:
+ - world_knowledge
+ - commonsense_reasoning
+ - language_understanding
+ - symbolic_problem_solving
+ - reading_comprehension
+ - programming
+ lm_task_average:
+ - world_knowledge_lm_task_subscore
+ - commonsense_reasoning_lm_task_subscore
+ - language_understanding_lm_task_subscore
+ - symbolic_problem_solving_lm_task_subscore
+ - reading_comprehension_lm_task_subscore
+ lite_average:
+ - world_knowledge_lite
+ - commonsense_reasoning_lite
+ - language_understanding_lite
+ - symbolic_problem_solving_lite
+ - reading_comprehension_lite
+ - programming_lite
categories:
- name: world_knowledge
benchmarks:
@@ -112,32 +133,32 @@ eval_gauntlet:
- name: boolq
num_fewshot: 10
random_baseline: 0.5
- - name: programming
- benchmarks:
- - name: human_eval
- num_fewshot: 0
- random_baseline: 0.0
- - name: human_eval_cpp
- num_fewshot: 0
- random_baseline: 0.0
- - name: human_eval_js
- num_fewshot: 0
- random_baseline: 0.0
- - name: human_eval_return_simple
- num_fewshot: 0
- random_baseline: 0.0
- - name: human_eval_return_complex
- num_fewshot: 0
- random_baseline: 0.0
- - name: human_eval_25
- num_fewshot: 0
- random_baseline: 0.0
- - name: human_eval_50
- num_fewshot: 0
- random_baseline: 0.0
- - name: human_eval_75
- num_fewshot: 0
- random_baseline: 0.0
+ # - name: programming
+ # benchmarks:
+ # - name: human_eval
+ # num_fewshot: 0
+ # random_baseline: 0.0
+ # - name: human_eval_cpp
+ # num_fewshot: 0
+ # random_baseline: 0.0
+ # - name: human_eval_js
+ # num_fewshot: 0
+ # random_baseline: 0.0
+ # - name: human_eval_return_simple
+ # num_fewshot: 0
+ # random_baseline: 0.0
+ # - name: human_eval_return_complex
+ # num_fewshot: 0
+ # random_baseline: 0.0
+ # - name: human_eval_25
+ # num_fewshot: 0
+ # random_baseline: 0.0
+ # - name: human_eval_50
+ # num_fewshot: 0
+ # random_baseline: 0.0
+ # - name: human_eval_75
+ # num_fewshot: 0
+ # random_baseline: 0.0
- name: world_knowledge_lm_task_subscore
benchmarks:
- name: jeopardy
@@ -237,8 +258,3 @@ eval_gauntlet:
- name: squad
num_fewshot: 10
random_baseline: 0
- - name: programming_lite
- benchmarks:
- - name: human_eval
- num_fewshot: 0
- random_baseline: 0.0
diff --git a/scripts/eval/yamls/hf_eval.yaml b/scripts/eval/yamls/hf_eval.yaml
index 05169818d9..759af8239a 100644
--- a/scripts/eval/yamls/hf_eval.yaml
+++ b/scripts/eval/yamls/hf_eval.yaml
@@ -43,5 +43,5 @@ device_eval_batch_size: 4
# forward_prefetch: True
# limit_all_gathers: True
-icl_tasks: 'eval/yamls/tasks.yaml'
+icl_tasks: 'eval/yamls/tasks_light.yaml'
eval_gauntlet: 'eval/yamls/eval_gauntlet.yaml'
diff --git a/scripts/eval/yamls/tasks.yaml b/scripts/eval/yamls/tasks.yaml
index 6b66c116ea..737b08ebeb 100644
--- a/scripts/eval/yamls/tasks.yaml
+++ b/scripts/eval/yamls/tasks.yaml
@@ -173,67 +173,67 @@ icl_tasks:
num_fewshot: [10]
icl_task_type: multiple_choice
continuation_delimiter: "\nAnswer: " # this separates questions from answers
--
- label: human_eval
- dataset_uri: eval/local_data/programming/human_eval.jsonl # ADD YOUR OWN DATASET URI
- num_fewshot: [0]
- pass_at_k: 1
- num_beams: 20
- batch_size: 1
- icl_task_type: code_evaluation
--
- label: human_eval_cpp
- dataset_uri: eval/local_data/programming/processed_human_eval_cpp.jsonl # ADD YOUR OWN DATASET URI
- num_fewshot: [0]
- pass_at_k: 1
- num_beams: 20
- batch_size: 1
- icl_task_type: code_evaluation
--
- label: human_eval_js
- dataset_uri: eval/local_data/programming/processed_human_eval_js.jsonl # ADD YOUR OWN DATASET URI
- num_fewshot: [0]
- pass_at_k: 1
- num_beams: 20
- batch_size: 1
- icl_task_type: code_evaluation
--
- label: human_eval_return_simple
- dataset_uri: eval/local_data/programming/human_eval_return_simple.jsonl # ADD YOUR OWN DATASET URI
- num_fewshot: [0]
- pass_at_k: 1
- num_beams: 20
- batch_size: 1
- icl_task_type: code_evaluation
--
- label: human_eval_return_complex
- dataset_uri: eval/local_data/programming/human_eval_return_complex.jsonl # ADD YOUR OWN DATASET URI
- num_fewshot: [0]
- pass_at_k: 1
- num_beams: 20
- batch_size: 1
- icl_task_type: code_evaluation
--
- label: human_eval_25
- dataset_uri: eval/local_data/programming/human_eval-0.25.jsonl # ADD YOUR OWN DATASET URI
- num_fewshot: [0]
- pass_at_k: 1
- num_beams: 20
- batch_size: 1
- icl_task_type: code_evaluation
--
- label: human_eval_50
- dataset_uri: eval/local_data/programming/human_eval-0.5.jsonl # ADD YOUR OWN DATASET URI
- num_fewshot: [0]
- pass_at_k: 1
- num_beams: 20
- batch_size: 1
- icl_task_type: code_evaluation
--
- label: human_eval_75
- dataset_uri: eval/local_data/programming/human_eval-0.75.jsonl # ADD YOUR OWN DATASET URI
- num_fewshot: [0]
- pass_at_k: 1
- num_beams: 20
- batch_size: 1
- icl_task_type: code_evaluation
+# -
+# label: human_eval
+# dataset_uri: eval/local_data/programming/human_eval.jsonl # ADD YOUR OWN DATASET URI
+# num_fewshot: [0]
+# pass_at_k: 1
+# num_beams: 20
+# batch_size: 1
+# icl_task_type: code_evaluation
+# -
+# label: human_eval_cpp
+# dataset_uri: eval/local_data/programming/processed_human_eval_cpp.jsonl # ADD YOUR OWN DATASET URI
+# num_fewshot: [0]
+# pass_at_k: 1
+# num_beams: 20
+# batch_size: 1
+# icl_task_type: code_evaluation
+# -
+# label: human_eval_js
+# dataset_uri: eval/local_data/programming/processed_human_eval_js.jsonl # ADD YOUR OWN DATASET URI
+# num_fewshot: [0]
+# pass_at_k: 1
+# num_beams: 20
+# batch_size: 1
+# icl_task_type: code_evaluation
+# -
+# label: human_eval_return_simple
+# dataset_uri: eval/local_data/programming/human_eval_return_simple.jsonl # ADD YOUR OWN DATASET URI
+# num_fewshot: [0]
+# pass_at_k: 1
+# num_beams: 20
+# batch_size: 1
+# icl_task_type: code_evaluation
+# -
+# label: human_eval_return_complex
+# dataset_uri: eval/local_data/programming/human_eval_return_complex.jsonl # ADD YOUR OWN DATASET URI
+# num_fewshot: [0]
+# pass_at_k: 1
+# num_beams: 20
+# batch_size: 1
+# icl_task_type: code_evaluation
+# -
+# label: human_eval_25
+# dataset_uri: eval/local_data/programming/human_eval-0.25.jsonl # ADD YOUR OWN DATASET URI
+# num_fewshot: [0]
+# pass_at_k: 1
+# num_beams: 20
+# batch_size: 1
+# icl_task_type: code_evaluation
+# -
+# label: human_eval_50
+# dataset_uri: eval/local_data/programming/human_eval-0.5.jsonl # ADD YOUR OWN DATASET URI
+# num_fewshot: [0]
+# pass_at_k: 1
+# num_beams: 20
+# batch_size: 1
+# icl_task_type: code_evaluation
+# -
+# label: human_eval_75
+# dataset_uri: eval/local_data/programming/human_eval-0.75.jsonl # ADD YOUR OWN DATASET URI
+# num_fewshot: [0]
+# pass_at_k: 1
+# num_beams: 20
+# batch_size: 1
+# icl_task_type: code_evaluation
diff --git a/scripts/inference/benchmarking/yamls/1b.yaml b/scripts/inference/benchmarking/yamls/1b.yaml
index f94aa3d806..d1cfb3c913 100644
--- a/scripts/inference/benchmarking/yamls/1b.yaml
+++ b/scripts/inference/benchmarking/yamls/1b.yaml
@@ -12,7 +12,6 @@ tokenizer:
model:
name: mpt_causal_lm
init_device: cpu
- tokenizer_name: ${tokenizer_name}
d_model: 2048
  n_heads: 16 # Modified 24->16 so that d_head == 128 to satisfy FlashAttention
n_layers: 24
diff --git a/scripts/inference/benchmarking/yamls/7b.yaml b/scripts/inference/benchmarking/yamls/7b.yaml
index 55e9ae8413..f57ed2657f 100644
--- a/scripts/inference/benchmarking/yamls/7b.yaml
+++ b/scripts/inference/benchmarking/yamls/7b.yaml
@@ -12,7 +12,6 @@ tokenizer:
model:
name: mpt_causal_lm
init_device: cpu
- tokenizer_name: ${tokenizer_name}
d_model: 4096
n_heads: 32
n_layers: 32
diff --git a/scripts/inference/convert_composer_mpt_to_ft.py b/scripts/inference/convert_composer_mpt_to_ft.py
index 79275030b3..f59eb6005a 100644
--- a/scripts/inference/convert_composer_mpt_to_ft.py
+++ b/scripts/inference/convert_composer_mpt_to_ft.py
@@ -67,6 +67,7 @@ def write_ft_checkpoint_from_composer_checkpoint(
checkpoint_path: Union[Path, str],
infer_gpu_num: int,
save_dir: str,
+ trust_remote_code: bool,
output_precision: str = 'fp32',
local_checkpoint_save_location: Optional[Union[Path,
str]] = None) -> None:
@@ -79,6 +80,7 @@ def write_ft_checkpoint_from_composer_checkpoint(
checkpoint_path (Union[Path, str]): Path to the composer checkpoint, can be a local path, or a remote path beginning with ``s3://``, or another backend
supported by Composer.
infer_gpu_num (int): The number of gpus you are planning to use for inference.
+ trust_remote_code (bool): Whether or not to use code outside of the transformers module.
save_dir (str): Path of the directory to save the checkpoint in FT format.
output_precision (str, optional): The precision of the output weights saved to the FasterTransformer model. Can be either ``fp32`` or ``fp16``.
local_checkpoint_save_location (Optional[Union[Path, str]], optional): If specified, where to save the checkpoint file to locally.
@@ -125,7 +127,7 @@ def write_ft_checkpoint_from_composer_checkpoint(
print('#' * 30)
print('Extracting HF Tokenizer...')
hf_tokenizer = get_hf_tokenizer_from_composer_state_dict(
- composer_state_dict)
+ composer_state_dict, trust_remote_code)
if hf_tokenizer is None:
print('Warning! No HF Tokenizer found!')
@@ -206,6 +208,10 @@ def parse_args() -> Namespace:
'Data type of weights in the FasterTransformer output model. Input checkpoint weights will be converted to this dtype.',
choices=['fp32', 'fp16'],
default='fp32')
+ parser.add_argument(
+ '--trust_remote_code',
+ action='store_true',
+ help='Whether or not to use code outside of transformers module.')
return parser.parse_args()
@@ -229,4 +235,5 @@ def parse_args() -> Namespace:
infer_gpu_num=args.infer_gpu_num,
save_dir=save_dir,
output_precision=args.output_precision,
- local_checkpoint_save_location=args.local_checkpoint_save_location)
+ local_checkpoint_save_location=args.local_checkpoint_save_location,
+ trust_remote_code=args.trust_remote_code)
diff --git a/scripts/inference/convert_composer_to_hf.py b/scripts/inference/convert_composer_to_hf.py
index 5625a3b046..1b43762473 100644
--- a/scripts/inference/convert_composer_to_hf.py
+++ b/scripts/inference/convert_composer_to_hf.py
@@ -16,6 +16,7 @@
from llmfoundry import MPTConfig, MPTForCausalLM
from llmfoundry.utils import get_hf_tokenizer_from_composer_state_dict
+from llmfoundry.utils.checkpoint_conversion_helpers import load_tokenizer
from llmfoundry.utils.huggingface_hub_utils import \
edit_files_for_hf_compatibility
@@ -23,6 +24,7 @@
def write_huggingface_pretrained_from_composer_checkpoint(
checkpoint_path: Union[Path, str],
output_path: Union[Path, str],
+ trust_remote_code: bool,
output_precision: str = 'fp32',
local_checkpoint_save_location: Optional[Union[Path, str]] = None
) -> Tuple[PretrainedConfig, Optional[PreTrainedTokenizerBase]]:
@@ -63,6 +65,7 @@ def write_huggingface_pretrained_from_composer_checkpoint(
checkpoint_path (Union[Path, str]): Path to the composer checkpoint, can be a local path, or a remote path beginning with ``s3://``, or another backend
supported by :meth:`composer.utils.maybe_create_object_store_from_uri`.
output_path (Union[Path, str]): Path to the folder to write the output to.
+ trust_remote_code (bool): Whether or not to use code outside of the transformers module.
output_precision (str, optional): The precision of the output weights saved to `pytorch_model.bin`. Can be one of ``fp32``, ``fp16``, or ``bf16``.
local_checkpoint_save_location (Optional[Union[Path, str]], optional): If specified, where to save the checkpoint file to locally.
If the input ``checkpoint_path`` is already a local path, this will be a symlink.
@@ -110,7 +113,7 @@ def write_huggingface_pretrained_from_composer_checkpoint(
print('#' * 30)
print('Saving HF Tokenizer...')
hf_tokenizer = get_hf_tokenizer_from_composer_state_dict(
- composer_state_dict)
+ composer_state_dict, trust_remote_code)
if hf_tokenizer is not None:
hf_tokenizer.save_pretrained(output_path)
print(hf_tokenizer)
@@ -157,6 +160,10 @@ def parse_args() -> Namespace:
default='fp32')
parser.add_argument('--hf_repo_for_upload', type=str, default=None)
parser.add_argument('--test_uploaded_model', action='store_true')
+ parser.add_argument(
+ '--trust_remote_code',
+ action='store_true',
+ help='Whether or not to use code outside of transformers module.')
return parser.parse_args()
@@ -179,6 +186,7 @@ def convert_composer_to_hf(args: Namespace) -> None:
config, tokenizer = write_huggingface_pretrained_from_composer_checkpoint(
checkpoint_path=args.composer_path,
output_path=local_folder_path,
+ trust_remote_code=args.trust_remote_code,
output_precision=args.output_precision,
local_checkpoint_save_location=args.local_checkpoint_save_location)
@@ -206,7 +214,9 @@ def convert_composer_to_hf(args: Namespace) -> None:
loaded_hf_model.save_pretrained(local_folder_path)
print(f'Loading tokenizer from {local_folder_path}')
- tokenizer = transformers.AutoTokenizer.from_pretrained(local_folder_path)
+
+ tokenizer = load_tokenizer(local_folder_path,
+ trust_remote_code=args.trust_remote_code)
tokenizer.save_pretrained(local_folder_path)
# Only need to edit files for MPT because it has custom code
diff --git a/scripts/inference/hf_generate.py b/scripts/inference/hf_generate.py
index 96592ca477..45ddc6b63e 100644
--- a/scripts/inference/hf_generate.py
+++ b/scripts/inference/hf_generate.py
@@ -217,6 +217,7 @@ def main(args: Namespace) -> None:
if device is not None:
print(f'Placing model on {device=}...')
model.to(device)
+ model.to(model_dtype)
except Exception as e:
raise RuntimeError(
'Unable to load HF model. ' +
diff --git a/scripts/misc/download_hf_model.py b/scripts/misc/download_hf_model.py
new file mode 100644
index 0000000000..58c3445e7d
--- /dev/null
+++ b/scripts/misc/download_hf_model.py
@@ -0,0 +1,83 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
+"""Script to download model weights from Hugging Face Hub or a cache server."""
+import argparse
+import logging
+import os
+import sys
+
+from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
+
+from llmfoundry.utils.model_download_utils import (download_from_cache_server,
+ download_from_hf_hub)
+
+HF_TOKEN_ENV_VAR = 'HUGGING_FACE_HUB_TOKEN'
+
+logging.basicConfig(format=f'%(asctime)s: %(levelname)s: %(name)s: %(message)s',
+ level=logging.INFO)
+log = logging.getLogger(__name__)
+
+if __name__ == '__main__':
+ argparser = argparse.ArgumentParser()
+ argparser.add_argument('--model', type=str, required=True)
+ argparser.add_argument('--download-from',
+ type=str,
+ choices=['hf', 'cache'],
+ default='hf')
+ argparser.add_argument('--token',
+ type=str,
+ default=os.getenv(HF_TOKEN_ENV_VAR))
+ argparser.add_argument('--save-dir',
+ type=str,
+ default=HUGGINGFACE_HUB_CACHE)
+ argparser.add_argument('--cache-url', type=str, default=None)
+ argparser.add_argument('--ignore-cert', action='store_true', default=False)
+ argparser.add_argument(
+ '--fallback',
+ action='store_true',
+ default=True,
+ help=
+        'Whether to fall back to downloading from Hugging Face if the download from the cache server fails',
+ )
+
+ args = argparser.parse_args(sys.argv[1:])
+ if args.download_from == 'hf':
+ download_from_hf_hub(args.model,
+ save_dir=args.save_dir,
+ token=args.token)
+ else:
+ try:
+ download_from_cache_server(
+ args.model,
+ args.cache_url,
+ args.save_dir,
+ token=args.token,
+ ignore_cert=args.ignore_cert,
+ )
+
+ # A little hacky: run the Hugging Face download just to repair the symlinks in the HF cache file structure.
+ # This shouldn't actually download any files if the cache server download was successful, but should address
+ # a non-deterministic bug where the symlinks aren't repaired properly by the time the model is initialized.
+ log.info('Repairing Hugging Face cache symlinks')
+
+ # Hide some noisy logs that aren't important for just the symlink repair.
+ old_level = logging.getLogger().level
+ logging.getLogger().setLevel(logging.ERROR)
+ download_from_hf_hub(args.model,
+ save_dir=args.save_dir,
+ token=args.token)
+ logging.getLogger().setLevel(old_level)
+
+ except PermissionError:
+ log.error(f'Not authorized to download {args.model}.')
+ except Exception as e:
+ if args.fallback:
+ log.warning(
+ f'Failed to download {args.model} from cache server. Falling back to Hugging Face Hub. Error: {e}'
+ )
+ download_from_hf_hub(args.model,
+ save_dir=args.save_dir,
+ token=args.token)
+ else:
+ raise e
diff --git a/scripts/misc/profile_packing.py b/scripts/misc/profile_packing.py
new file mode 100644
index 0000000000..51841d669e
--- /dev/null
+++ b/scripts/misc/profile_packing.py
@@ -0,0 +1,100 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
+"""Script to profile example packing."""
+import os
+from typing import Dict
+
+from llmfoundry.data.packing import profile_packing
+
+if __name__ == '__main__':
+ from argparse import ArgumentParser, Namespace
+
+ from omegaconf import OmegaConf as om
+
+ from llmfoundry.utils import build_tokenizer
+
+ def parse_args() -> Namespace:
+ """Parse commandline arguments."""
+ parser = ArgumentParser(
+ description=
+ 'Profile packing_ratio choices for a particular workload.')
+ parser.add_argument(
+ '--yaml-path',
+ type=str,
+ required=True,
+ help='Path to the YAML that defines the workload to profile.')
+ parser.add_argument('--num-devices',
+ type=int,
+ default=None,
+ help='How many devices your run will use.')
+ parser.add_argument('--min',
+ type=float,
+ required=True,
+ help='Smallest packing_ratio to test. Must be >=1.')
+ parser.add_argument(
+ '--max',
+ type=float,
+ required=True,
+ help='Largest packing_ratio to test. Must be larger than `min`.')
+ parser.add_argument(
+ '--num-packing-ratios',
+ type=int,
+ default=20,
+ help=
+        'Number of packing_ratio values (spaced between `min` and `max`) to try.'
+ )
+
+ args = parser.parse_args()
+
+ if not os.path.isfile(args.yaml_path):
+ raise FileNotFoundError(
+ '`yaml_path` does not correspond to any existing file.')
+    if args.num_devices is None or args.num_devices < 1:
+        raise ValueError('`num_devices` must be specified as a positive integer.')
+ if args.min < 1.0:
+ raise ValueError('`min` must be >=1.0.')
+ if args.max < args.min:
+ raise ValueError('`max` cannot be less than `min`.')
+ if args.num_packing_ratios < 1:
+ raise ValueError('`num_packing_ratios` must be a positive integer.')
+ return args
+
+ args = parse_args()
+
+ with open(args.yaml_path) as f:
+ cfg = om.load(f)
+ if 'parameters' in cfg:
+ cfg = om.to_container(cfg.parameters)
+ cfg = om.create(cfg)
+ device_batch_size = cfg.global_train_batch_size // args.num_devices
+
+ # Fetch a bunch of raw examples once, which we'll re-use
+ if 'train_loader' not in cfg:
+ raise ValueError('config must define train_loader')
+ dataloader_cfg = cfg.train_loader
+
+ # build tokenizer
+ if 'tokenizer' not in cfg:
+ raise ValueError('config must define tokenizer')
+
+ resolved_tokenizer_cfg = om.to_container(cfg.tokenizer, resolve=True)
+ if not isinstance(resolved_tokenizer_cfg, Dict):
+ raise ValueError(
+ 'tokenizer config needs to be resolved by omegaconf into a Dict.')
+ tokenizer_cfg = resolved_tokenizer_cfg
+
+ tokenizer_name = tokenizer_cfg['name']
+ tokenizer_kwargs = tokenizer_cfg.get('kwargs', {})
+ tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs)
+
+ results = profile_packing(dataloader_cfg, tokenizer, args.min, args.max,
+ args.num_packing_ratios, device_batch_size)
+
+ header = '\n\n\n packing_ratio | % PADDING | % WASTE'
+ fstr = ' {:5.1f} | {:5.2f}% | {:6.2f}%'
+
+ print(header)
+ print('-' * len(header))
+ for packing_ratio, padding, waste in results:
+ print(fstr.format(packing_ratio, padding, waste))
diff --git a/scripts/train/README.md b/scripts/train/README.md
index f10fdf59f0..4c706dc040 100644
--- a/scripts/train/README.md
+++ b/scripts/train/README.md
@@ -5,14 +5,15 @@ This README walks through pretraining and finetuning a large language model usin
#### Table of Contents
1. [Part 1: LLM Pretraining](#llmpretraining)
1. [Installation](#installation)
- 2. [Dataset Preparation](#datasetpreparation)
- 3. [How to start single and multi-node pretraining](#howtostartpretraining)
-2. [Part 2: LLM Finetuning](#llmfinetuning)
+ 1. [Dataset Preparation](#datasetpreparation)
+ 1. [How to start single and multi-node pretraining](#howtostartpretraining)
+1. [Part 2: LLM Finetuning](#llmfinetuning)
1. [Using a dataset on the HuggingFace Hub](#hfdataset)
- 2. [Using a local dataset](#localdataset)
- 3. [Using a StreamingDataset (MDS) formatted dataset locally or in an object store](#mdsdataset)
-3. [FAQ: How many GPUs do I need to train a LLM?](#howmandygpus)
-4. [FAQ: Optimizing Performance](#optimizingperformance)
+ 1. [Using a local dataset](#localdataset)
+ 1. [Using a StreamingDataset (MDS) formatted dataset locally or in an object store](#mdsdataset)
+1. [Using Flash Attention](#flashattention)
+1. [FAQ: How many GPUs do I need to train a LLM?](#howmandygpus)
+1. [FAQ: Optimizing Performance](#optimizingperformance)
# Part 1: LLM Pretraining
@@ -332,6 +333,53 @@ train_loader:
...
```
+# Using Flash Attention
+
+Flash Attention is an optimized implementation of the attention mechanism, first introduced by [Dao et al.](https://github.com/Dao-AILab/flash-attention). Three versions of Flash Attention can be used with LLM Foundry: Flash Attention V1, Flash Attention V2, and a Triton implementation of Flash Attention. To start, we recommend using one of our [provided Docker images](../../README.md#mosaicml-docker-images) corresponding to the Flash Attention version you would like to use. The Triton implementation can be used with either Flash Attention V1 or V2. How you then enable Flash Attention depends on which model you are using.
+
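+If you are unsure which version is available in your environment, here is a minimal sketch of a check using the `is_flash_v1_installed` and `is_flash_v2_installed` helpers from this repo's attention module:
+```python
+# Minimal sketch: report which Flash Attention version this environment provides.
+from llmfoundry.models.layers.attention import (is_flash_v1_installed,
+                                                is_flash_v2_installed)
+
+if is_flash_v2_installed():
+    print('Flash Attention V2 is installed.')
+elif is_flash_v1_installed():
+    print('Flash Attention V1 is installed.')
+else:
+    print('No Flash Attention installation detected.')
+```
+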
+For MPT, you can specify Flash Attention in your YAML like so:
+```yaml
+model:
+ name: mpt_causal_lm
+ ...
+ attn_config:
+ # Will use either V1 or V2 depending on what is installed
+ # "triton" will use the Triton implementation
+ attn_impl: flash
+ ...
+```
+
+If loading MPT from the HuggingFace Hub, you can specify Flash Attention in your YAML like so:
+```yaml
+model:
+ name: hf_causal_lm
+ pretrained_model_name_or_path: mosaicml/mpt-7b
+ ...
+ config_overrides:
+ # Will use either V1 or V2 depending on what is installed
+ # "triton" will use the Triton implementation
+ attn_config:
+ attn_impl: flash
+ ...
+```
+
+For any HuggingFace model that supports Flash Attention (e.g. Llama and Mistral), you can specify Flash Attention in your YAML like so:
+```yaml
+model:
+ name: hf_causal_lm
+ use_flash_attention_2: True # Will be automatically set to True if Flash Attention V2 is installed and the model supports it
+ ...
+```
+Note that HuggingFace models currently support only Flash Attention V2.
+
+For Llama specifically, we have another option if you would like to use the Triton implementation of Flash Attention. You can specify this in your YAML like so:
+```yaml
+model:
+ name: hf_causal_lm
+ pretrained_model_name_or_path: meta-llama/Llama-2-7b-hf
+ attention_patch_type: triton
+ ...
+```
# FAQ: How many GPUs do I need to train a LLM?
This is a complicated question in general, but if we assume that you are using FSDP with `FULL_SHARD`,
diff --git a/scripts/train/finetune_example/mpt-7b-arc-easy--gpu.yaml b/scripts/train/finetune_example/mpt-7b-arc-easy--gpu.yaml
index 2c3fb11496..ed2e9fcac0 100644
--- a/scripts/train/finetune_example/mpt-7b-arc-easy--gpu.yaml
+++ b/scripts/train/finetune_example/mpt-7b-arc-easy--gpu.yaml
@@ -41,7 +41,10 @@ train_loader:
shuffle: true
max_seq_len: ${max_seq_len}
decoder_only_format: true
- # # Use `python llmfoundry/data/packing.py --yaml-path /path/to/this/yaml/ ...`
+ # # Use packing_ratio: 'auto' to automatically profile and select the highest observed packing ratio with
+ # # zero waste. In practice, this may result in > 0 waste because profiling is done on only a portion
+ # # of the dataset.
+  # # Or use `python scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...`
# # to profile this run's optimal packing_ratio as it depends on GPU count,
# # batch size, sequence length
# packing_ratio:
diff --git a/scripts/train/train.py b/scripts/train/train.py
index 28ecb68e34..88f776375f 100644
--- a/scripts/train/train.py
+++ b/scripts/train/train.py
@@ -1,6 +1,7 @@
# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0
import copy
+import gc
import logging
import os
import sys
@@ -23,9 +24,8 @@
from transformers import PreTrainedTokenizerBase
from llmfoundry import (COMPOSER_MODEL_REGISTRY, ComposerHFCausalLM,
- MPTForCausalLM, build_finetuning_dataloader,
- build_text_denoising_dataloader)
-from llmfoundry.data.text_data import build_text_dataloader
+ MPTForCausalLM)
+from llmfoundry.data.dataloader import build_dataloader
from llmfoundry.utils.builders import (build_algorithm, build_callback,
build_icl_data_and_gauntlet,
build_logger, build_optimizer,
@@ -168,30 +168,6 @@ def print_trainable_parameters(model: torch.nn.Module) -> None:
)
-def build_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase,
- device_batch_size: int):
- if cfg.name == 'text':
- return build_text_dataloader(
- cfg,
- tokenizer,
- device_batch_size,
- )
- elif cfg.name == 'text_denoising':
- return build_text_denoising_dataloader(
- cfg,
- tokenizer,
- device_batch_size,
- )
- elif cfg.name == 'finetuning':
- return build_finetuning_dataloader(
- cfg,
- tokenizer,
- device_batch_size,
- )
- else:
- raise ValueError(f'Not sure how to build dataloader with config: {cfg}')
-
-
def main(cfg: DictConfig) -> Trainer:
# Filter deprecation warning from torch internal usage
warnings.filterwarnings(
@@ -216,6 +192,12 @@ def main(cfg: DictConfig) -> Trainer:
os.environ[
'PYTORCH_CUDA_ALLOC_CONF'] = f'max_split_size_mb:{max_split_size_mb}'
+ # Set CUDA lazy loading
+ # This can save a bit of memory if not all modules are needed
+ cuda_load_lazy: bool = cfg.pop('cuda_load_lazy', False)
+ if cuda_load_lazy:
+ os.environ['CUDA_MODULE_LOADING'] = 'LAZY'
+
# Set seed first
seed: int = pop_config(cfg, 'seed', must_exist=True)
reproducibility.seed_all(seed)
@@ -401,6 +383,12 @@ def main(cfg: DictConfig) -> Trainer:
'compile_config',
must_exist=False,
default_value=None)
+ metadata: Optional[Dict[str, str]] = pop_config(cfg,
+ 'metadata',
+ must_exist=False,
+ default_value=None,
+ convert=True)
+
# Enable autoresume from model checkpoints if possible
autoresume_default: bool = False
if logged_cfg.get('run_name', None) is not None \
@@ -478,6 +466,14 @@ def main(cfg: DictConfig) -> Trainer:
mosaicml_logger = MosaicMLLogger()
loggers.append(mosaicml_logger)
+ if metadata is not None:
+ # Flatten the metadata for logging
+ logged_cfg.pop('metadata', None)
+ logged_cfg.update(metadata, merge=True)
+ if mosaicml_logger is not None:
+ mosaicml_logger.log_metrics(metadata)
+ mosaicml_logger._flush_metadata(force_flush=True)
+
# Profiling
profiler: Optional[Profiler] = None
profiler_cfg: Optional[DictConfig] = pop_config(cfg,
@@ -634,6 +630,7 @@ def main(cfg: DictConfig) -> Trainer:
print('Logging config')
log_config(logged_cfg)
torch.cuda.empty_cache()
+ gc.collect()
# Eval first if requested
if eval_first and trainer.state.timestamp.batch.value == 0:
diff --git a/scripts/train/yamls/finetune/1b_local_data_sft.yaml b/scripts/train/yamls/finetune/1b_local_data_sft.yaml
index 45dca2f1e0..d6f72b0c8e 100644
--- a/scripts/train/yamls/finetune/1b_local_data_sft.yaml
+++ b/scripts/train/yamls/finetune/1b_local_data_sft.yaml
@@ -49,7 +49,10 @@ train_loader: &train_loader
allow_pad_trimming: false
decoder_only_format: true
shuffle: true
- # # Use `python llmfoundry/data/packing.py --yaml-path /path/to/this/yaml/ ...`
+ # # Use packing_ratio: 'auto' to automatically profile and select the highest observed packing ratio with
+ # # zero waste. In practice, this may result in > 0 waste because profiling is done on only a portion
+ # # of the dataset.
+    # # Or use `python scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...`
# # to profile this run's optimal packing_ratio as it depends on GPU count,
# # batch size, sequence length
# packing_ratio:
diff --git a/scripts/train/yamls/finetune/7b_dolly_sft.yaml b/scripts/train/yamls/finetune/7b_dolly_sft.yaml
index 6483dd31f5..c5813235d9 100644
--- a/scripts/train/yamls/finetune/7b_dolly_sft.yaml
+++ b/scripts/train/yamls/finetune/7b_dolly_sft.yaml
@@ -41,7 +41,10 @@ train_loader:
allow_pad_trimming: false
decoder_only_format: true
shuffle: true
- # # Use `python llmfoundry/data/packing.py --yaml-path /path/to/this/yaml/ ...`
+ # # Use packing_ratio: 'auto' to automatically profile and select the highest observed packing ratio with
+ # # zero waste. In practice, this may result in > 0 waste because profiling is done on only a portion
+ # # of the dataset.
+    # # Or use `python scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...`
# # to profile this run's optimal packing_ratio as it depends on GPU count,
# # batch size, sequence length
# packing_ratio:
diff --git a/scripts/train/yamls/finetune/mpt-7b_dolly_sft.yaml b/scripts/train/yamls/finetune/mpt-7b_dolly_sft.yaml
index 9686317bef..2f23d8e55a 100644
--- a/scripts/train/yamls/finetune/mpt-7b_dolly_sft.yaml
+++ b/scripts/train/yamls/finetune/mpt-7b_dolly_sft.yaml
@@ -31,7 +31,10 @@ train_loader:
max_seq_len: ${max_seq_len}
allow_pad_trimming: false
decoder_only_format: true
- # # Use `python llmfoundry/data/packing.py --yaml-path /path/to/this/yaml/ ...`
+ # # Use packing_ratio: 'auto' to automatically profile and select the highest observed packing ratio with
+ # # zero waste. In practice, this may result in > 0 waste because profiling is done on only a portion
+ # # of the dataset.
+    # # Or use `python scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...`
# # to profile this run's optimal packing_ratio as it depends on GPU count,
# # batch size, sequence length
# packing_ratio:
diff --git a/scripts/train/yamls/pretrain/gpt-neo-125m.yaml b/scripts/train/yamls/pretrain/gpt-neo-125m.yaml
index cfb447e2e4..12914e14bc 100644
--- a/scripts/train/yamls/pretrain/gpt-neo-125m.yaml
+++ b/scripts/train/yamls/pretrain/gpt-neo-125m.yaml
@@ -34,7 +34,6 @@ train_loader:
remote: ${data_remote}
split: train
shuffle: true
- tokenizer_name: ${tokenizer_name}
max_seq_len: ${max_seq_len}
shuffle_seed: ${global_seed}
drop_last: true
@@ -47,7 +46,6 @@ eval_loader:
remote: ${data_remote}
split: val
shuffle: false
- tokenizer_name: ${tokenizer_name}
max_seq_len: ${max_seq_len}
shuffle_seed: ${global_seed}
drop_last: false
diff --git a/scripts/train/yamls/pretrain/gpt-neo-125m_eval.yaml b/scripts/train/yamls/pretrain/gpt-neo-125m_eval.yaml
index fc1e3b0b7f..3da239c717 100644
--- a/scripts/train/yamls/pretrain/gpt-neo-125m_eval.yaml
+++ b/scripts/train/yamls/pretrain/gpt-neo-125m_eval.yaml
@@ -34,7 +34,6 @@ train_loader:
remote: ${data_remote}
split: train
shuffle: true
- tokenizer_name: ${tokenizer_name}
max_seq_len: ${max_seq_len}
shuffle_seed: ${global_seed}
drop_last: true
@@ -47,7 +46,6 @@ eval_loader:
remote: ${data_remote}
split: val
shuffle: false
- tokenizer_name: ${tokenizer_name}
max_seq_len: ${max_seq_len}
shuffle_seed: ${global_seed}
drop_last: false
diff --git a/scripts/train/yamls/pretrain/gpt2-small.yaml b/scripts/train/yamls/pretrain/gpt2-small.yaml
index dde59d55b1..d40cff6e9e 100644
--- a/scripts/train/yamls/pretrain/gpt2-small.yaml
+++ b/scripts/train/yamls/pretrain/gpt2-small.yaml
@@ -34,7 +34,6 @@ train_loader:
remote: ${data_remote}
split: train
shuffle: true
- tokenizer_name: ${tokenizer_name}
max_seq_len: ${max_seq_len}
shuffle_seed: ${global_seed}
drop_last: true
@@ -47,7 +46,6 @@ eval_loader:
remote: ${data_remote}
split: val
shuffle: false
- tokenizer_name: ${tokenizer_name}
max_seq_len: ${max_seq_len}
shuffle_seed: ${global_seed}
drop_last: false
diff --git a/scripts/train/yamls/pretrain/opt-3b.yaml b/scripts/train/yamls/pretrain/opt-3b.yaml
index 3ac281f0ea..4423784b54 100644
--- a/scripts/train/yamls/pretrain/opt-3b.yaml
+++ b/scripts/train/yamls/pretrain/opt-3b.yaml
@@ -27,7 +27,6 @@ train_loader:
remote: ${data_remote}
split: train
shuffle: true
- tokenizer_name: ${tokenizer_name}
max_seq_len: ${max_seq_len}
shuffle_seed: ${global_seed}
drop_last: true
@@ -40,7 +39,6 @@ eval_loader:
remote: ${data_remote}
split: val
shuffle: false
- tokenizer_name: ${tokenizer_name}
max_seq_len: ${max_seq_len}
shuffle_seed: ${global_seed}
drop_last: false
diff --git a/setup.py b/setup.py
index d0ecc66160..afdfce8d48 100644
--- a/setup.py
+++ b/setup.py
@@ -47,10 +47,10 @@
]
install_requires = [
- 'mosaicml[libcloud,wandb,mlflow,oci,gcs]>=0.16.4,<0.17',
+ 'mosaicml[libcloud,wandb,mlflow,oci,gcs]>=0.17,<0.18',
'accelerate>=0.20,<0.21', # for HF inference `device_map`
- 'transformers>=4.33,<4.34',
- 'mosaicml-streaming>=0.6,<0.7',
+ 'transformers>=4.34.1,<4.35',
+ 'mosaicml-streaming>=0.7.1,<0.8',
'torch>=1.13.1,<2.1.1',
'datasets>=2.14.5,<2.15',
'fsspec==2023.6.0', # newer version results in a bug in datasets that duplicates data
@@ -66,6 +66,8 @@
'triton-pre-mlir@git+https://github.com/vchiley/triton.git@triton_pre_mlir_sm90#subdirectory=python',
'boto3>=1.21.45,<2',
'huggingface-hub>=0.17.0,<1.0',
+ 'beautifulsoup4>=4.12.2,<5', # required for model download utils
+ 'tenacity>=8.2.3,<9',
]
extra_deps = {}
@@ -81,8 +83,12 @@
'hf_transfer==0.1.3',
]
+extra_deps['databricks'] = [
+ 'mosaicml[databricks]>=0.17,<0.18',
+]
+
extra_deps['tensorboard'] = [
- 'mosaicml[tensorboard]>=0.16.1,<0.17',
+ 'mosaicml[tensorboard]>=0.17,<0.18',
]
extra_deps['gpu'] = [
@@ -101,7 +107,8 @@
extra_deps['peft'] = [
'loralib==0.1.1', # lora core
'bitsandbytes==0.39.1', # 8bit
- 'scipy>=1.10.0,<=1.11.0', # bitsandbytes dependency; TODO: eliminate when incorporated to bitsandbytes
+    # bitsandbytes dependency; TODO: eliminate when incorporated into bitsandbytes
+ 'scipy>=1.10.0,<=1.11.0',
# TODO: pin peft when it stabilizes.
# PyPI does not support direct dependencies, so we remove this line before uploading from PyPI
'peft==0.4.0',
@@ -114,9 +121,10 @@
extra_deps['all-cpu'] = set(
dep for key, deps in extra_deps.items() for dep in deps if 'gpu' not in key)
extra_deps['all'] = set(dep for key, deps in extra_deps.items() for dep in deps
- if key != 'gpu-flash2')
-extra_deps['all-flash2'] = set(
- dep for key, deps in extra_deps.items() for dep in deps if key != 'gpu')
+ if key not in {'gpu-flash2', 'all-cpu'})
+extra_deps['all-flash2'] = set(dep for key, deps in extra_deps.items()
+ for dep in deps
+ if key not in {'gpu', 'all', 'all-cpu'})
setup(
name=_PACKAGE_NAME,
diff --git a/tests/conftest.py b/tests/conftest.py
index b39ebd66a9..545dc7e38f 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,12 +1,10 @@
# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0
-import gc
import os
from typing import List, Optional
import pytest
-import torch
from composer.utils import reproducibility
# Allowed options for pytest.mark.world_size()
@@ -18,6 +16,13 @@
# Enforce deterministic mode before any tests start.
reproducibility.configure_deterministic_mode()
+# Add the path of any pytest fixture files you want to make global
+pytest_plugins = [
+ 'tests.fixtures.autouse',
+ 'tests.fixtures.models',
+ 'tests.fixtures.data',
+]
+
def _add_option(parser: pytest.Parser,
name: str,
@@ -78,12 +83,3 @@ def pytest_collection_modifyitems(config: pytest.Config,
def pytest_sessionfinish(session: pytest.Session, exitstatus: int):
if exitstatus == 5:
session.exitstatus = 0 # Ignore no-test-ran errors
-
-
-@pytest.fixture(autouse=True)
-def clear_cuda_cache(request: pytest.FixtureRequest):
- """Clear memory between GPU tests."""
- marker = request.node.get_closest_marker('gpu')
- if marker is not None and torch.cuda.is_available():
- torch.cuda.empty_cache()
- gc.collect() # Only gc on GPU tests as it 2x slows down CPU tests
diff --git a/tests/fixtures/__init__.py b/tests/fixtures/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/fixtures/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/fixtures/autouse.py b/tests/fixtures/autouse.py
new file mode 100644
index 0000000000..c51ccfacb0
--- /dev/null
+++ b/tests/fixtures/autouse.py
@@ -0,0 +1,39 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
+import gc
+
+import pytest
+import torch
+from composer.utils import dist, get_device, reproducibility
+
+
+@pytest.fixture(autouse=True)
+def initialize_dist(request: pytest.FixtureRequest):
+ """Initialize the default PyTorch distributed process group for tests."""
+ # should we just always initialize dist like in train.py?
+ _default = pytest.mark.world_size(1).mark
+ world_size = request.node.get_closest_marker('world_size', _default).args[0]
+ gpu = request.node.get_closest_marker('gpu')
+ if world_size > 1:
+ dist.initialize_dist(get_device('gpu' if gpu is not None else 'cpu'))
+
+
+@pytest.fixture(autouse=True)
+def clear_cuda_cache(request: pytest.FixtureRequest):
+ """Clear memory between GPU tests."""
+ marker = request.node.get_closest_marker('gpu')
+ if marker is not None and torch.cuda.is_available():
+ torch.cuda.empty_cache()
+ gc.collect() # Only gc on GPU tests as it 2x slows down CPU tests
+
+
+@pytest.fixture
+def random_seed() -> int:
+ return 17
+
+
+@pytest.fixture(autouse=True)
+def seed_all(random_seed: int):
+ """Sets the seed for reproducibility."""
+ reproducibility.seed_all(random_seed)
diff --git a/tests/fixtures/data.py b/tests/fixtures/data.py
new file mode 100644
index 0000000000..16dd01347d
--- /dev/null
+++ b/tests/fixtures/data.py
@@ -0,0 +1,60 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+from composer.utils import dist
+from omegaconf import DictConfig
+from pytest import fixture
+from torch.utils.data import DataLoader
+from transformers import PreTrainedTokenizerBase
+
+from llmfoundry.data.finetuning.dataloader import build_finetuning_dataloader
+from tests.data_utils import make_tiny_ft_dataset
+
+
+@fixture
+def tiny_ft_dataset_path(tmp_path: Path, dataset_size: int = 4) -> Path:
+ """Creates a tiny dataset and returns the path."""
+ tiny_dataset_path = tmp_path / 'test-ift-data-small'
+ tiny_dataset_path.mkdir(exist_ok=True)
+ tiny_dataset_file = tiny_dataset_path / 'train.jsonl'
+ if dist.get_world_size() == 1 or dist.get_global_rank() == 0:
+ make_tiny_ft_dataset(path=str(tiny_dataset_file), size=dataset_size)
+ return tiny_dataset_path
+
+
+@fixture
+@patch('os.cpu_count', MagicMock(return_value=None))
+def tiny_ft_dataloader(tiny_ft_dataset_path: Path,
+ mpt_tokenizer: PreTrainedTokenizerBase,
+ max_seq_len: int = 128,
+ device_batch_size: int = 1) -> DataLoader:
+ dataloader_cfg = DictConfig({
+ 'name': 'finetuning',
+ 'dataset': {
+ 'hf_name': str(tiny_ft_dataset_path),
+ 'split': 'train',
+ 'max_seq_len': max_seq_len,
+ 'decoder_only_format': True,
+ 'allow_pad_trimming': False,
+ 'packing_ratio': None,
+ 'shuffle': True,
+ },
+ 'drop_last': False,
+ 'num_workers': 4,
+ 'pin_memory': False,
+ 'prefetch_factor': 2,
+ 'persistent_workers': False,
+ 'timeout': 0
+ })
+
+ dataloader = build_finetuning_dataloader(
+ dataloader_cfg,
+ mpt_tokenizer,
+ device_batch_size,
+ ).dataloader
+
+ assert isinstance(dataloader, DataLoader)
+ return dataloader
diff --git a/tests/fixtures/models.py b/tests/fixtures/models.py
new file mode 100644
index 0000000000..1b1ef86302
--- /dev/null
+++ b/tests/fixtures/models.py
@@ -0,0 +1,70 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Callable
+
+from omegaconf import DictConfig
+from pytest import fixture
+from transformers import PreTrainedTokenizerBase
+
+from llmfoundry.models.hf.hf_causal_lm import ComposerHFCausalLM
+from llmfoundry.models.model_registry import COMPOSER_MODEL_REGISTRY
+from llmfoundry.models.mpt.modeling_mpt import ComposerMPTCausalLM
+from llmfoundry.utils.builders import build_tokenizer
+
+
+def _build_model(config: DictConfig, tokenizer: PreTrainedTokenizerBase):
+ model = COMPOSER_MODEL_REGISTRY[config.name](config, tokenizer)
+ return model
+
+
+@fixture
+def mpt_tokenizer():
+ return build_tokenizer('EleutherAI/gpt-neox-20b', {})
+
+
+@fixture
+def build_tiny_mpt(
+ mpt_tokenizer: PreTrainedTokenizerBase
+) -> Callable[..., ComposerMPTCausalLM]:
+
+ def build(**kwargs: Any) -> ComposerMPTCausalLM:
+ config = DictConfig({
+ 'name': 'mpt_causal_lm',
+ 'd_model': 128,
+ 'n_heads': 4,
+ 'n_layers': 2,
+ 'expansion_ratio': 2,
+ })
+ config.update(kwargs)
+ model = _build_model(config, mpt_tokenizer)
+ assert isinstance(model, ComposerMPTCausalLM)
+ return model
+
+ return build
+
+
+@fixture
+def build_tiny_hf_mpt(
+ mpt_tokenizer: PreTrainedTokenizerBase
+) -> Callable[..., ComposerHFCausalLM]:
+
+ def build(**kwargs: Any) -> ComposerHFCausalLM:
+ config_overrides = {
+ 'd_model': 128,
+ 'n_heads': 4,
+ 'n_layers': 2,
+ 'expansion_ratio': 2,
+ }
+ config_overrides.update(kwargs)
+ config = DictConfig({
+ 'name': 'hf_causal_lm',
+ 'pretrained_model_name_or_path': 'mosaicml/mpt-7b',
+ 'pretrained': False,
+ 'config_overrides': config_overrides,
+ })
+ model = _build_model(config, mpt_tokenizer)
+ assert isinstance(model, ComposerHFCausalLM)
+ return model
+
+ return build
diff --git a/tests/test_builders.py b/tests/test_builders.py
index 0d24d2154f..237e27b52b 100644
--- a/tests/test_builders.py
+++ b/tests/test_builders.py
@@ -6,8 +6,10 @@
import pytest
from composer.callbacks import Generate
+from omegaconf import OmegaConf as om
from transformers import PreTrainedTokenizerBase
+from llmfoundry.callbacks import HuggingFaceCheckpointer
from llmfoundry.tokenizers.tiktoken import TiktokenTokenizerWrapper
from llmfoundry.utils.builders import build_callback, build_tokenizer
@@ -78,3 +80,33 @@ def test_build_generate_callback_unspecified_interval():
'foo': 'bar',
'something': 'else',
})
+
+
+def test_build_hf_checkpointer_callback():
+ with mock.patch.object(HuggingFaceCheckpointer,
+ '__init__') as mock_hf_checkpointer:
+ mock_hf_checkpointer.return_value = None
+ save_folder = 'path_to_save_folder'
+ save_interval = 1
+ mlflow_logging_config_dict = {
+ 'metadata': {
+ 'databricks_model_family': 'MptForCausalLM',
+ 'databricks_model_size_parameters': '7b',
+ 'databricks_model_source': 'mosaic-fine-tuning',
+ 'task': 'llm/v1/completions'
+ }
+ }
+ build_callback(name='hf_checkpointer',
+ kwargs=om.create({
+ 'save_folder': save_folder,
+ 'save_interval': save_interval,
+ 'mlflow_logging_config': mlflow_logging_config_dict
+ }))
+
+ assert mock_hf_checkpointer.call_count == 1
+ _, _, kwargs = mock_hf_checkpointer.mock_calls[0]
+ assert kwargs['save_folder'] == save_folder
+ assert kwargs['save_interval'] == save_interval
+ assert isinstance(kwargs['mlflow_logging_config'], dict)
+ assert isinstance(kwargs['mlflow_logging_config']['metadata'], dict)
+ assert kwargs['mlflow_logging_config'] == mlflow_logging_config_dict
diff --git a/tests/test_data_prep_scripts.py b/tests/test_data_prep_scripts.py
index 4c555ea9a2..4fe5ed7e64 100644
--- a/tests/test_data_prep_scripts.py
+++ b/tests/test_data_prep_scripts.py
@@ -2,9 +2,9 @@
# SPDX-License-Identifier: Apache-2.0
import os
-import shutil
import sys
from argparse import Namespace
+from pathlib import Path
# Add repo root to path so we can import scripts and test it
repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
@@ -13,17 +13,16 @@
from scripts.data_prep.convert_dataset_json import main as main_json
-def test_download_script_from_api():
+def test_download_script_from_api(tmp_path: Path):
# test calling it directly
- path = os.path.join(os.getcwd(), 'my-copy-c4-1')
- shutil.rmtree(path, ignore_errors=True)
+ path = os.path.join(tmp_path, 'my-copy-c4-1')
main_hf(
Namespace(
**{
'dataset': 'c4',
'data_subset': 'en',
'splits': ['val_xsmall'],
- 'out_root': './my-copy-c4-1',
+ 'out_root': path,
'compression': None,
'concat_tokens': None,
'bos_text': None,
@@ -32,18 +31,16 @@ def test_download_script_from_api():
'num_workers': None
}))
assert os.path.exists(path)
- shutil.rmtree(path, ignore_errors=False)
-def test_json_script_from_api():
+def test_json_script_from_api(tmp_path: Path):
# test calling it directly
- path = os.path.join(os.getcwd(), 'my-copy-arxiv-1')
- shutil.rmtree(path, ignore_errors=True)
+ path = os.path.join(tmp_path, 'my-copy-arxiv-1')
main_json(
Namespace(
**{
'path': 'scripts/data_prep/example_data/arxiv.jsonl',
- 'out_root': './my-copy-arxiv-1',
+ 'out_root': path,
'compression': None,
'split': 'train',
'concat_tokens': None,
@@ -53,4 +50,3 @@ def test_json_script_from_api():
'num_workers': None
}))
assert os.path.exists(path)
- shutil.rmtree(path, ignore_errors=False)
diff --git a/tests/test_dataloader.py b/tests/test_dataloader.py
index 656b6d52a6..2080ec32ec 100644
--- a/tests/test_dataloader.py
+++ b/tests/test_dataloader.py
@@ -8,7 +8,7 @@
import sys
import tempfile
from argparse import Namespace
-from typing import Optional
+from typing import Literal, Optional, Union
from unittest.mock import MagicMock
import pytest
@@ -248,10 +248,11 @@ def test_denoising_dataloader(decoder_only_format: bool, pretokenize: bool,
@pytest.mark.parametrize('decoder_only_format', [True, False])
@pytest.mark.parametrize('allow_pad_trimming', [True, False])
-@pytest.mark.parametrize('packing_ratio', [10.0, None])
+@pytest.mark.parametrize('packing_ratio', [10.0, None, 'auto'])
def test_finetuning_dataloader(decoder_only_format: bool,
allow_pad_trimming: bool,
- packing_ratio: Optional[float]):
+ packing_ratio: Optional[Union[float,
+ Literal['auto']]]):
# Use the datasets just built in the last test
tokenizer_name = 'gpt2' if decoder_only_format else 't5-base'
max_seq_len = 2048 if decoder_only_format else 1024
diff --git a/tests/test_eval.py b/tests/test_eval.py
index ecd15ab62f..1217487b70 100644
--- a/tests/test_eval.py
+++ b/tests/test_eval.py
@@ -62,7 +62,7 @@ def test_icl_eval(capfd: Any, mock_saved_model_path: Any):
assert isinstance(test_cfg, om.DictConfig)
main(test_cfg)
out, _ = capfd.readouterr()
- expected_results = '| Category | Benchmark | Subtask | Accuracy | Number few shot | Model |\n|:----------------------------|:---------------|:----------|-----------:|:------------------|:---------|\n| language_understanding_lite | lambada_openai | | 0 | 0-shot | tiny_mpt '
+ expected_results = '| Category | Benchmark | Subtask | Accuracy | Number few shot | Model |\n|:----------------------------|:---------------|:----------|-----------:|:------------------|:---------|\n| language_understanding_lite | lambada_openai | | 0 | 0-shot | tiny_mpt |'
assert expected_results in out
- expected_results = '| model_name | average | language_understanding_lite |\n|:-------------|----------:|------------------------------:|\n| tiny_mpt | 0 | 0 |'
+ expected_results = '| model_name | default_average | language_understanding_lite |\n|:-------------|------------------:|------------------------------:|\n| tiny_mpt | 0 | 0 |'
assert expected_results in out
diff --git a/tests/test_eval_gauntlet.py b/tests/test_eval_gauntlet.py
index 8ccdd75766..3a1e371ab8 100644
--- a/tests/test_eval_gauntlet.py
+++ b/tests/test_eval_gauntlet.py
@@ -2,7 +2,7 @@
# SPDX-License-Identifier: Apache-2.0
import os
-from typing import Dict, List
+from typing import Dict, List, Optional
import omegaconf as om
import pytest
@@ -53,7 +53,10 @@ def log_metrics(self, metrics: Dict[str, float]) -> None:
self.inmemorylogger.log_metrics(metrics)
-def test_gauntlet_callback():
+@pytest.mark.parametrize('averages', [{
+ 'core_average': ['world_knowledge', 'language_understanding']
+}, None])
+def test_gauntlet_callback(averages: Optional[dict]):
icl_task_config = om.OmegaConf.create("""
- label: jeopardy_small
dataset_uri: eval/local_data/world_knowledge/jeopardy_small.jsonl # ADD YOUR OWN DATASET URI
@@ -87,6 +90,9 @@ def test_gauntlet_callback():
""")
assert isinstance(eval_gauntlet_config, om.DictConfig) or isinstance(
eval_gauntlet_config, str)
+
+ if averages is not None:
+ eval_gauntlet_config.averages = averages
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b')
# test loading functionality
@@ -106,4 +112,9 @@ def test_gauntlet_callback():
name = f'icl/metrics/eval_gauntlet/{category}'
assert result[name] == pytest.approx(0.25)
- assert result['icl/metrics/eval_gauntlet/average'] == pytest.approx(0.25)
+ if averages is None:
+ assert result[
+ 'icl/metrics/eval_gauntlet/default_average'] == pytest.approx(0.25)
+ else:
+ assert result[
+ 'icl/metrics/eval_gauntlet/core_average'] == pytest.approx(0.25)
diff --git a/tests/test_flash_triton_torch.py b/tests/test_flash_triton_torch.py
index 145d4a5885..1ede36c0b5 100644
--- a/tests/test_flash_triton_torch.py
+++ b/tests/test_flash_triton_torch.py
@@ -3,9 +3,11 @@
import pytest
import torch
-from composer.utils import reproducibility
from omegaconf import OmegaConf as om
+from llmfoundry.models.layers.attention import is_flash_v2_installed
+from llmfoundry.models.mpt.modeling_mpt import gen_rotary_embedding
+
def allclose_helper(t0: torch.Tensor,
t1: torch.Tensor,
@@ -19,7 +21,32 @@ def allclose_helper(t0: torch.Tensor,
@pytest.mark.parametrize('attn_impl_1', ['flash', 'triton', 'torch'])
@pytest.mark.parametrize('clip_qkv', [True, False])
@pytest.mark.parametrize('qk_ln', [True, False])
-@pytest.mark.parametrize('alibi', [True, False])
+@pytest.mark.parametrize('pos_emb_config', [{
+ 'alibi': False,
+ 'rope': False
+}, {
+ 'alibi': True,
+ 'rope': False
+}, {
+ 'alibi': False,
+ 'rope': True,
+ 'rope_theta': 10000,
+ 'rope_impl': 'dail',
+ 'rope_dail_config': {
+ 'type': 'original',
+ 'pos_idx_in_fp32': True,
+ 'xpos_scale_base': 512,
+ },
+}, {
+ 'alibi': False,
+ 'rope': True,
+ 'rope_theta': 10000,
+ 'rope_impl': 'hf',
+ 'rope_hf_config': {
+ 'type': 'no_scaling',
+ 'factor': 1.0,
+ },
+}])
@pytest.mark.parametrize(
'attn_type',
['multihead_attention', 'multiquery_attention', 'grouped_query_attention'])
@@ -27,23 +54,27 @@ def test_attn_impl(attn_impl_0: str,
attn_impl_1: str,
clip_qkv: bool,
qk_ln: bool,
- alibi: bool,
+ pos_emb_config: dict,
attn_type: str,
device: str = 'cuda'):
"""Compare all attn impl with each other.
- Includes testing with and without attn_clip_qkv, attn_qk_ln, and alibi.
+ Includes testing with and without attn_clip_qkv, attn_qk_ln, alibi, and
+ rope.
"""
from llmfoundry.models.layers import attention
-
+ alibi = pos_emb_config['alibi']
+ rope = pos_emb_config['rope']
if alibi and (attn_impl_0 == 'flash' or attn_impl_1 == 'flash'):
pytest.xfail('flash attn does not support alibi')
- reproducibility.seed_all(7)
+ if rope and (pos_emb_config['rope_impl']
+ == 'dail') and (not is_flash_v2_installed()):
+ pytest.skip('dail implementation of rope requires flash attention 2.')
cfg = om.create({
'attn_impl': 'flash',
- 'd_model': 128,
+ 'd_model': 64,
'n_heads': 4,
'attn_pdrop': 0,
'clip_qkv': clip_qkv,
@@ -51,12 +82,13 @@ def test_attn_impl(attn_impl_0: str,
})
n, s, f = 2, 16, cfg.d_model
-
+ assert cfg.d_model % cfg.n_heads == 0
if attn_type == 'grouped_query_attention':
cfg.kv_n_heads = 2
cfg.attn_impl = attn_impl_0
attn0 = attention.ATTN_CLASS_REGISTRY[attn_type](**cfg).to(device)
+ cfg.attn_impl = attn_impl_1
attn1 = attention.ATTN_CLASS_REGISTRY[attn_type](**cfg).to(device)
attn1.load_state_dict(attn0.state_dict())
@@ -94,16 +126,45 @@ def gen_bias(attn_impl: str):
with torch.autocast(x0.device.type):
attn_bias = gen_bias(attn0.attn_impl)
+
+ rotary_emb_w_meta_info = None
+ if rope:
+ rotary_embedding = gen_rotary_embedding(
+ rope_head_dim=cfg.d_model // cfg.n_heads,
+ rope_impl=pos_emb_config['rope_impl'],
+ rope_theta=pos_emb_config['rope_theta'],
+ rope_dail_config=pos_emb_config.get('rope_dail_config', {}),
+ rope_hf_config=pos_emb_config.get('rope_hf_config', {}),
+ max_seq_len=s).to(device)
+ pos = torch.arange(s).unsqueeze(0).to(device=device)
+ # adjust the position indices to account for padding tokens
+ pos = torch.clamp(
+ pos - torch.cumsum((~attention_mask).to(torch.int32), dim=1),
+ min=0,
+ )
+ rotary_emb_w_meta_info = {
+ 'impl':
+ pos_emb_config['rope_impl'],
+ 'rotary_emb':
+ rotary_embedding,
+ 'offset_info':
+ pos if (pos_emb_config['rope_impl'] == 'hf') else 0,
+ 'seq_len':
+ s,
+ }
+
y0, _, _ = attn0(x0,
past_key_value=None,
attn_bias=attn_bias,
attention_mask=attention_mask,
+ rotary_emb_w_meta_info=rotary_emb_w_meta_info,
is_causal=True)
attn_bias = gen_bias(attn1.attn_impl)
y1, _, _ = attn1(x1,
past_key_value=None,
attn_bias=attn_bias,
attention_mask=attention_mask,
+ rotary_emb_w_meta_info=rotary_emb_w_meta_info,
is_causal=True)
y0 *= attention_mask.unsqueeze(-1)
y1 *= attention_mask.unsqueeze(-1)
@@ -122,7 +183,15 @@ def gen_bias(attn_impl: str):
assert p.grad is not None
assert tp.grad is not None
assert allclose_helper(p, tp)
- assert allclose_helper(p.grad, tp.grad)
+
+ using_hf_rope = pos_emb_config['rope'] and pos_emb_config[
+ 'rope_impl'] == 'hf'
+
+ # special case that (likely) fails due to numerics
+ if clip_qkv and qk_ln and using_hf_rope and attn_type == 'grouped_query_attention':
+ assert allclose_helper(p.grad, tp.grad, atol=2.e-2, rtol=2.e-2)
+ else:
+ assert allclose_helper(p.grad, tp.grad)
assert x0.grad is not None
assert x1.grad is not None
@@ -135,8 +204,6 @@ def test_vs_mha(attn_impl: str, device: str = 'cuda'):
"""Compare diff attn_impl to torch.nn.MultiheadAttention."""
from llmfoundry.models.layers import attention
- reproducibility.seed_all(17)
-
cfg = om.create({
'attn_impl': attn_impl,
'd_model': 256,
@@ -234,8 +301,6 @@ def test_grouped_attention_heads(attn_impl: str,
"""Ensure grouped_query_attention runs w/ diff n_heads & kv_n_heads."""
from llmfoundry.models.layers import attention
- reproducibility.seed_all(17)
-
cfg = om.create({
'attn_impl': attn_impl,
'd_model': 256,
@@ -273,8 +338,6 @@ def test_grouped_query_invalid_heads(attn_impl: str, device: str = 'cuda'):
"""Check indivisble combinations of grouped_query_attention."""
from llmfoundry.models.layers import attention
- reproducibility.seed_all(17)
-
cfg = om.create({
'attn_impl': attn_impl,
'd_model': 256,
diff --git a/tests/test_fsdp_act_checkpoint.py b/tests/test_fsdp_act_checkpoint.py
new file mode 100644
index 0000000000..1a46fcbccd
--- /dev/null
+++ b/tests/test_fsdp_act_checkpoint.py
@@ -0,0 +1,73 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
+import pytest
+from composer import Trainer
+from composer.utils import get_device
+from omegaconf import OmegaConf as om
+from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import \
+ CheckpointWrapper
+
+from llmfoundry.models.mpt.modeling_mpt import ComposerMPTCausalLM
+
+
+@pytest.mark.world_size(2)
+@pytest.mark.gpu
+@pytest.mark.parametrize('activation_checkpointing', [True, False])
+@pytest.mark.parametrize(
+ 'activation_checkpointing_target',
+ [[], ['grouped_query_attention'], ['mptblock', 'grouped_query_attention']])
+def test_fsdp_act_checkpoint(activation_checkpointing: bool,
+ activation_checkpointing_target: list):
+ device = get_device('gpu')
+ model_cfg = {
+ 'name': 'mpt_causal_lm',
+ 'd_model': 128,
+ 'n_heads': 4,
+ 'n_layers': 2,
+ 'expansion_ratio': 1,
+ 'max_seq_len': 16,
+ 'vocab_size': 50368,
+ 'attn_config': {
+ 'attn_type': 'grouped_query_attention',
+ 'kv_n_heads': 2,
+ },
+ 'activation_checkpointing_target': activation_checkpointing_target
+ }
+ model_cfg = om.create(model_cfg)
+
+ fsdp_config = {
+ 'activation_checkpointing': activation_checkpointing,
+ 'activation_checkpointing_reentrant': False,
+ 'activation_cpu_offload': False,
+ }
+
+ model = ComposerMPTCausalLM(model_cfg)
+ model = device.module_to_device(model)
+
+ trainer = Trainer(
+ model=model,
+ device='gpu',
+ fsdp_config=fsdp_config,
+ )
+
+ assert trainer.state.fsdp_enabled
+ if not activation_checkpointing:
+ assert not isinstance(
+ trainer.state.model.model._fsdp_wrapped_module.transformer.
+ blocks[0], CheckpointWrapper)
+ elif (not activation_checkpointing_target
+ ) or activation_checkpointing_target == [
+ 'mptblock', 'grouped_query_attention'
+ ]:
+ assert isinstance(
+ trainer.state.model.model._fsdp_wrapped_module.transformer.
+ blocks[0]._fsdp_wrapped_module, CheckpointWrapper)
+ elif activation_checkpointing_target == ['grouped_query_attention']:
+ assert isinstance(
+ trainer.state.model.model._fsdp_wrapped_module.transformer.
+ blocks[0]._fsdp_wrapped_module.attn, CheckpointWrapper)
+ else:
+ raise ValueError(
+ f'Unknown activation_checkpointing_target: {activation_checkpointing_target}'
+ )
diff --git a/tests/test_hf_config.py b/tests/test_hf_config.py
index 5b3bb3d150..b47f267c55 100644
--- a/tests/test_hf_config.py
+++ b/tests/test_hf_config.py
@@ -9,7 +9,6 @@
import pytest
import torch
-from composer.utils import reproducibility
from omegaconf import DictConfig
from omegaconf import OmegaConf as om
from transformers import AutoModelForCausalLM
@@ -93,8 +92,6 @@ def test_hf_config_override(
with open(conf_path) as f:
test_cfg = om.load(f)
- reproducibility.seed_all(test_cfg.seed)
-
# Build Model
# For fast initialization, use `meta` device
print('Initializing model...')
diff --git a/tests/test_hf_conversion_script.py b/tests/test_hf_conversion_script.py
index e7787754de..dcb743b536 100644
--- a/tests/test_hf_conversion_script.py
+++ b/tests/test_hf_conversion_script.py
@@ -5,7 +5,7 @@
import os
import pathlib
import sys
-from unittest.mock import MagicMock
+from unittest.mock import ANY, MagicMock, patch
from composer import Trainer
from composer.loggers import MLFlowLogger
@@ -138,6 +138,49 @@ def check_hf_tokenizer_equivalence(tokenizer1: PreTrainedTokenizerBase,
tokenizer1.__dict__['init_kwargs'].pop('auto_map', None)
tokenizer2.__dict__['init_kwargs'].pop('auto_map', None)
+ # Additional special tokens do not match between original tokenizer and loaded tokenizer due to transformers
+ # constructor differences
+ additional_special_tokens_1 = {
+ t if isinstance(t, str) else t.content
+ for t in tokenizer1.__dict__.pop('_additional_special_tokens', [])
+ }
+ additional_special_tokens_2 = {
+ t if isinstance(t, str) else t.content
+ for t in tokenizer2.__dict__.pop('_additional_special_tokens', [])
+ }
+ # Also pop it out of init_kwargs
+ tokenizer1.__dict__['init_kwargs'].pop('additional_special_tokens', None)
+ tokenizer2.__dict__['init_kwargs'].pop('additional_special_tokens', None)
+ tokenizer1.__dict__['init_kwargs'].pop('added_tokens_decoder', None)
+ tokenizer2.__dict__['init_kwargs'].pop('added_tokens_decoder', None)
+ # If the additional special tokens are the same (or a subset of each other), or if one of them is empty, then we are good
+ assert additional_special_tokens_1.issubset(
+ additional_special_tokens_2) or additional_special_tokens_2.issubset(
+ additional_special_tokens_1)
+
+ # The special token attributes may be strings or they may be AddedToken objects, so we just check string values
+ # First check that they have the same attrs
+ assert tokenizer1.SPECIAL_TOKENS_ATTRIBUTES == tokenizer2.SPECIAL_TOKENS_ATTRIBUTES
+ # Then check that the values are the same
+ for special_token_attr in tokenizer1.SPECIAL_TOKENS_ATTRIBUTES:
+ # Skip additional_special_tokens because we already checked it above
+ if special_token_attr == 'additional_special_tokens':
+ continue
+
+ # The init_kwargs can change between the original tokenizer and the loaded tokenizer,
+ # so we just pop them
+ tokenizer1.__dict__['init_kwargs'].pop(special_token_attr, None)
+ tokenizer2.__dict__['init_kwargs'].pop(special_token_attr, None)
+
+ attr1 = tokenizer1.__dict__.pop('_' + special_token_attr, None)
+ attr2 = tokenizer2.__dict__.pop('_' + special_token_attr, None)
+ if attr1 is None and attr2 is None:
+ continue
+
+ attr_value1 = attr1 if isinstance(attr1, str) else attr1.content
+ attr_value2 = attr2 if isinstance(attr2, str) else attr2.content
+ assert attr_value1 == attr_value2
+
assert tokenizer1.__dict__ == tokenizer2.__dict__
@@ -199,34 +242,54 @@ def get_config(
return cast(DictConfig, test_cfg)
-def test_callback_inits_with_defaults():
+def test_callback_inits():
+ # test with defaults
_ = HuggingFaceCheckpointer(save_folder='test', save_interval='1ba')
+    # test default metadata when mlflow registered name is given
+ hf_checkpointer = HuggingFaceCheckpointer(
+ save_folder='test',
+ save_interval='1ba',
+ mlflow_registered_model_name='test_model_name')
+ assert hf_checkpointer.mlflow_logging_config == {
+ 'task': 'text-generation',
+ 'metadata': {
+ 'task': 'llm/v1/completions'
+ }
+ }
+
@pytest.mark.world_size(2)
@pytest.mark.gpu
-@pytest.mark.parametrize('model', ['mpt', 'neo', 'llama2'])
+@pytest.mark.parametrize(
+ 'model,tie_word_embeddings',
+ [('mpt', True), ('mpt', False), ('neo', None), ('llama2', None)],
+)
@pytest.mark.parametrize('fsdp_state_dict_type', ['full', 'sharded', None])
@pytest.mark.parametrize('log_to_mlflow', [True, False])
-def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path,
- fsdp_state_dict_type: Optional[str],
- log_to_mlflow: bool):
+@pytest.mark.parametrize(
+ 'hf_save_interval,save_interval,max_duration,expected_hf_checkpoints,expected_normal_checkpoints',
+ [('3ba', '2ba', '7ba', 3, 4), ('1dur', '2ba', '1ep', 1, 4)])
+@patch('os.cpu_count', MagicMock(return_value=None))
+def test_huggingface_conversion_callback(
+ model: str, tmp_path: pathlib.Path, tie_word_embeddings: bool,
+ fsdp_state_dict_type: Optional[str], log_to_mlflow: bool,
+ hf_save_interval: str, save_interval: str, max_duration: str,
+ expected_hf_checkpoints: int, expected_normal_checkpoints: int):
delete_transformers_cache()
dist.initialize_dist(get_device('gpu'))
max_seq_len = 16
- save_interval_batches = 2
- huggingface_save_interval_batches = 3
device_batch_size = 1
dataset_size = 14
- max_duration_batches = 7
precision_str = 'bfloat16'
precision = torch.bfloat16
+ batches_per_epoch = math.ceil(dataset_size / (device_batch_size * 2))
checkpointer_callback = HuggingFaceCheckpointer(
save_folder=os.path.join(tmp_path, 'checkpoints'),
- save_interval=f'{huggingface_save_interval_batches}ba',
+ save_interval=hf_save_interval,
precision=precision_str,
mlflow_registered_model_name='dummy-registered-name'
if log_to_mlflow else None,
@@ -249,9 +312,11 @@ def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path,
'attn_impl': 'torch',
},
'loss_fn': 'torch_crossentropy',
+ 'tie_word_embeddings': tie_word_embeddings,
}
tokenizer_name = 'EleutherAI/gpt-neox-20b'
elif model == 'neo':
+ assert tie_word_embeddings is None
model_cfg = {
'name': 'hf_causal_lm',
'pretrained_model_name_or_path': 'EleutherAI/gpt-neo-125M',
@@ -264,6 +329,7 @@ def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path,
}
tokenizer_name = 'EleutherAI/gpt-neo-125M'
elif model == 'llama2':
+ assert tie_word_embeddings is None
if 'HUGGING_FACE_HUB_TOKEN' not in os.environ:
pytest.skip(
'The CI cluster does not have access to the Llama models, so skip this test.'
@@ -362,8 +428,8 @@ def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path,
fsdp_config=fsdp_config if fsdp_state_dict_type is not None else None,
train_dataloader=train_dataloader,
save_folder=os.path.join(tmp_path, 'checkpoints'),
- save_interval=f'{save_interval_batches}ba',
- max_duration=f'{max_duration_batches}ba',
+ save_interval=save_interval,
+ max_duration=max_duration,
callbacks=[checkpointer_callback],
loggers=[mlflow_logger_mock] if log_to_mlflow else [],
optimizers=optimizer,
@@ -372,10 +438,18 @@ def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path,
trainer.fit()
if dist.get_global_rank() == 0:
- assert mlflow_logger_mock.save_model.call_count == (1 if log_to_mlflow
- else 0)
- assert mlflow_logger_mock.register_model.call_count == (
- 1 if log_to_mlflow else 0)
+ if log_to_mlflow:
+ assert mlflow_logger_mock.save_model.call_count == 1
+ mlflow_logger_mock.save_model.assert_called_with(
+ flavor='transformers',
+ transformers_model=ANY,
+ path=ANY,
+ task='text-generation',
+ metadata={'task': 'llm/v1/completions'})
+ assert mlflow_logger_mock.register_model.call_count == 1
+ else:
+ assert mlflow_logger_mock.save_model.call_count == 0
+ assert mlflow_logger_mock.register_model.call_count == 0
else:
assert mlflow_logger_mock.log_model.call_count == 0
assert mlflow_logger_mock.register_model.call_count == 0
@@ -399,15 +473,13 @@ def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path,
name for name in os.listdir(
os.path.join(tmp_path, 'checkpoints', 'huggingface'))
]
- assert len(normal_checkpoints) == math.ceil(max_duration_batches /
- save_interval_batches)
- assert len(huggingface_checkpoints) == math.ceil(
- max_duration_batches / huggingface_save_interval_batches)
+ assert len(normal_checkpoints) == expected_normal_checkpoints
+ assert len(huggingface_checkpoints) == expected_hf_checkpoints
# Load the last huggingface checkpoint
loaded_model = transformers.AutoModelForCausalLM.from_pretrained(
os.path.join(tmp_path, 'checkpoints', 'huggingface',
- f'ba{max_duration_batches}'),
+ f'ba{batches_per_epoch}'),
trust_remote_code=True,
)
@@ -428,7 +500,7 @@ def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path,
loaded_tokenizer = transformers.AutoTokenizer.from_pretrained(
os.path.join(tmp_path, 'checkpoints', 'huggingface',
- f'ba{max_duration_batches}'),
+ f'ba{batches_per_epoch}'),
trust_remote_code=True,
)
@@ -442,19 +514,26 @@ def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path,
delete_transformers_cache()
-@pytest.mark.parametrize('model', ['mpt', 'neo', 'llama2'])
-def test_convert_and_generate(model: str, tmp_path: pathlib.Path):
+@pytest.mark.parametrize(
+ 'model,tie_word_embeddings',
+ [('mpt', True), ('mpt', False), ('neo', None), ('llama2', None)],
+)
+def test_convert_and_generate(model: str, tie_word_embeddings: bool,
+ tmp_path: pathlib.Path):
delete_transformers_cache()
om_cfg = None
if model == 'mpt':
om_cfg = get_config(
conf_path='scripts/train/yamls/pretrain/testing.yaml')
+ om_cfg['tie_word_embeddings'] = tie_word_embeddings
elif model == 'neo':
+ assert tie_word_embeddings is None
om_cfg = get_config(
conf_path='scripts/train/yamls/pretrain/gpt-neo-125m.yaml')
om_cfg['model']['config_overrides']['hidden_size'] = 36
elif model == 'llama2':
+ assert tie_word_embeddings is None
if 'HUGGING_FACE_HUB_TOKEN' not in os.environ:
pytest.skip(
'The CI cluster does not have access to the Llama models, so skip this test.'
@@ -483,6 +562,7 @@ def test_convert_and_generate(model: str, tmp_path: pathlib.Path):
output_precision='fp32',
local_checkpoint_save_location=None,
hf_repo_for_upload=None,
+ trust_remote_code=False,
test_uploaded_model=False)
convert_composer_to_hf(args)
@@ -514,11 +594,14 @@ def test_convert_and_generate(model: str, tmp_path: pathlib.Path):
@pytest.mark.gpu
-def test_convert_and_generate_triton(tmp_path: pathlib.Path):
+@pytest.mark.parametrize('tie_word_embeddings', [True, False])
+def test_convert_and_generate_triton(tie_word_embeddings: str,
+ tmp_path: pathlib.Path):
delete_transformers_cache()
cfg = get_config()
cfg['model']['init_device'] = 'cpu'
+ cfg['tie_word_embeddings'] = tie_word_embeddings
tokenizer = transformers.AutoTokenizer.from_pretrained(
'EleutherAI/gpt-neox-20b')
model = ComposerMPTCausalLM(cfg['model'], tokenizer)
@@ -530,6 +613,7 @@ def test_convert_and_generate_triton(tmp_path: pathlib.Path):
output_precision='fp32',
local_checkpoint_save_location=None,
hf_repo_for_upload=None,
+ trust_remote_code=False,
test_uploaded_model=False)
convert_composer_to_hf(args)
@@ -553,7 +637,9 @@ def test_convert_and_generate_triton(tmp_path: pathlib.Path):
delete_transformers_cache()
-def test_convert_and_generate_meta(tmp_path: pathlib.Path):
+@pytest.mark.parametrize('tie_word_embeddings', [True, False])
+def test_convert_and_generate_meta(tie_word_embeddings: str,
+ tmp_path: pathlib.Path):
delete_transformers_cache()
from composer.utils import dist
@@ -563,6 +649,7 @@ def test_convert_and_generate_meta(tmp_path: pathlib.Path):
om_cfg = get_config(conf_path='scripts/train/yamls/pretrain/testing.yaml')
om_cfg['model']['init_device'] = 'cpu'
+ om_cfg['tie_word_embeddings'] = tie_word_embeddings
tokenizer = transformers.AutoTokenizer.from_pretrained(
om_cfg.tokenizer.name)
original_model = COMPOSER_MODEL_REGISTRY[om_cfg['model'].name](
@@ -584,6 +671,7 @@ def test_convert_and_generate_meta(tmp_path: pathlib.Path):
output_precision='fp32',
local_checkpoint_save_location=None,
hf_repo_for_upload=None,
+ trust_remote_code=False,
test_uploaded_model=False)
convert_composer_to_hf(args)
diff --git a/tests/test_hf_mpt_gen.py b/tests/test_hf_mpt_gen.py
index cc357141ba..ea133c64fa 100644
--- a/tests/test_hf_mpt_gen.py
+++ b/tests/test_hf_mpt_gen.py
@@ -1,167 +1,51 @@
# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0
-from pathlib import Path
-from typing import Any, Dict
-from unittest.mock import Mock
+from typing import Callable
import pytest
-from composer.callbacks import Generate as ComposerGenerate
from composer.core.precision import get_precision_context
-from composer.trainer import Trainer
-from composer.utils import get_device, reproducibility
-from omegaconf import DictConfig
-from omegaconf import OmegaConf as om
+from composer.utils import get_device
+from transformers import PreTrainedTokenizerBase
-from llmfoundry import COMPOSER_MODEL_REGISTRY
-from llmfoundry.data.finetuning import build_finetuning_dataloader
-from llmfoundry.utils import build_tokenizer
-from tests.data_utils import make_tiny_ft_dataset
+from llmfoundry.models.hf.hf_causal_lm import ComposerHFCausalLM
@pytest.mark.gpu
@pytest.mark.parametrize('device', ['cpu', 'gpu'])
@pytest.mark.parametrize('attn_impl', ['triton', 'torch'])
-def test_init_hfhub_mpt(device: str, attn_impl: str):
+def test_init_hfhub_mpt(
+ device: str,
+ attn_impl: str,
+ build_tiny_hf_mpt: Callable[..., ComposerHFCausalLM],
+ mpt_tokenizer: PreTrainedTokenizerBase,
+):
if device == 'cpu' and attn_impl == 'triton':
pytest.skip(f'{attn_impl=} not implemented for {device=}.')
composer_device = get_device(device)
- with open('scripts/train/yamls/pretrain/testing.yaml') as f:
- test_cfg = om.load(f)
-
- assert isinstance(test_cfg, DictConfig)
- reproducibility.seed_all(test_cfg.get('seed', 42))
-
- attn_uses_sequence_id = True if test_cfg.get('eos_token_id',
- None) is not None else False
- test_cfg.model = DictConfig({
- 'name': 'hf_causal_lm',
- 'pretrained_model_name_or_path': 'mosaicml/mpt-7b',
- 'pretrained': False,
- 'config_overrides': {
- 'd_model': 128,
- 'n_heads': 4,
- 'n_layers': 2,
- 'expansion_ratio': 2,
- 'attn_config': {
- 'attn_impl': attn_impl,
- 'attn_uses_sequence_id': attn_uses_sequence_id,
- },
- },
+ model = build_tiny_hf_mpt(attn_config={
+ 'attn_impl': attn_impl,
+ 'attn_uses_sequence_id': False,
})
-
- # build tokenizer
- tokenizer_cfg: Dict[str,
- Any] = om.to_container(test_cfg.tokenizer,
- resolve=True) # type: ignore
- tokenizer_name = tokenizer_cfg['name']
- tokenizer_kwargs = tokenizer_cfg.get('kwargs', {})
- tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs)
-
- # build model
- model = COMPOSER_MODEL_REGISTRY[test_cfg.model.name](test_cfg.model,
- tokenizer)
- test_cfg.n_params = sum(p.numel() for p in model.parameters())
+ model = composer_device.module_to_device(model)
model.eval()
- model = composer_device.module_to_device(model)
with get_precision_context('amp_bf16' if composer_device.name ==
'gpu' else 'fp32'):
_ = model.generate(
composer_device.tensor_to_device(
- tokenizer('hello', return_tensors='pt')['input_ids']),
+ mpt_tokenizer('hello', return_tensors='pt')['input_ids']),
max_new_tokens=10,
)
-def test_init_hfhub_mpt_cpu():
- test_init_hfhub_mpt(device='cpu', attn_impl='torch')
-
-
-@pytest.mark.gpu
-def test_mpt_generate_callback(tmpdir: Path):
- composer_device = get_device('gpu')
- reproducibility.seed_all(42)
- max_seq_len = 128
-
- # testing dataset and dataloader
- dataset_size = 5
-
- tiny_dataset_path = tmpdir / 'test-ift-data-small'
- tiny_dataset_path.mkdir()
- tiny_dataset_file = tiny_dataset_path / 'train.jsonl'
- make_tiny_ft_dataset(path=str(tiny_dataset_file), size=dataset_size)
-
- dataloader_cfg = DictConfig({
- 'name': 'finetuning',
- 'dataset': {
- 'hf_name': str(tiny_dataset_path),
- 'split': 'train',
- 'max_seq_len': max_seq_len,
- 'decoder_only_format': True,
- 'allow_pad_trimming': False,
- 'packing_ratio': None,
- 'shuffle': True,
- },
- 'drop_last': False,
- 'num_workers': 4,
- 'pin_memory': False,
- 'prefetch_factor': 2,
- 'persistent_workers': False,
- 'timeout': 0
- })
-
- # build tokenizer
- tokenizer = build_tokenizer('EleutherAI/gpt-neox-20b', {})
-
- # build mpt model
- model_config = DictConfig({
- 'name': 'mpt_causal_lm',
- 'config_overrides': {
- 'd_model': 128,
- 'n_heads': 4,
- 'n_layers': 2,
- 'expansion_ratio': 2,
- },
- })
- model = COMPOSER_MODEL_REGISTRY[model_config.name](model_config, tokenizer)
- model = composer_device.module_to_device(model)
-
- # generate callback
- prompts = [
- 'The best banana bread recipe is',
- '2+2=',
- 'how much wood could a woodchuck chuck',
- ]
- gen_interval = 1
- generate = ComposerGenerate(
- prompts,
- interval=f'{gen_interval}ba',
- max_new_tokens=5,
- batch_size=len(prompts),
- use_cache=True,
- )
- generate.generate = Mock(wraps=generate.generate, autospec=True)
-
- # build trainer
- device_batch_size = 1
- train_dataloader = build_finetuning_dataloader(
- dataloader_cfg,
- tokenizer,
- device_batch_size,
- )
-
- trainer = Trainer(
- model=model,
- train_dataloader=train_dataloader,
- device=composer_device,
- max_duration=f'{gen_interval}ba',
- callbacks=[generate],
- )
- trainer.logger.log_table = Mock()
- trainer.fit()
-
- generate.generate.assert_called_once()
- trainer.logger.log_table.assert_called_once()
+def test_init_hfhub_mpt_cpu(
+ build_tiny_hf_mpt: Callable[..., ComposerHFCausalLM],
+ mpt_tokenizer: PreTrainedTokenizerBase,
+):
+ test_init_hfhub_mpt(device='cpu',
+ attn_impl='torch',
+ build_tiny_hf_mpt=build_tiny_hf_mpt,
+ mpt_tokenizer=mpt_tokenizer)
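The refactor above swaps ad hoc YAML loading for `build_tiny_hf_mpt` and `mpt_tokenizer` fixtures. As a rough illustration only, a fixture pair like the following would be compatible with how the tests call them; this is a hedged sketch, not the repository's actual conftest.py, and the fixture bodies are assumptions.

from typing import Any, Callable

import pytest
from omegaconf import DictConfig
from transformers import PreTrainedTokenizerBase

from llmfoundry import COMPOSER_MODEL_REGISTRY, ComposerHFCausalLM
from llmfoundry.utils.builders import build_tokenizer


@pytest.fixture
def mpt_tokenizer() -> PreTrainedTokenizerBase:
    # MPT reuses the GPT-NeoX-20B tokenizer.
    return build_tokenizer('EleutherAI/gpt-neox-20b', {})


@pytest.fixture
def build_tiny_hf_mpt(
        mpt_tokenizer: PreTrainedTokenizerBase
) -> Callable[..., ComposerHFCausalLM]:

    def build(**config_overrides: Any) -> ComposerHFCausalLM:
        # Tiny MPT dimensions keep the test fast; extra kwargs
        # (e.g. attn_config) are merged into config_overrides.
        overrides = {
            'd_model': 128,
            'n_heads': 4,
            'n_layers': 2,
            'expansion_ratio': 2,
        }
        overrides.update(config_overrides)
        model_cfg = DictConfig({
            'name': 'hf_causal_lm',
            'pretrained_model_name_or_path': 'mosaicml/mpt-7b',
            'pretrained': False,
            'config_overrides': overrides,
        })
        return COMPOSER_MODEL_REGISTRY[model_cfg.name](model_cfg,
                                                       mpt_tokenizer)

    return build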
diff --git a/tests/test_hf_v_mpt.py b/tests/test_hf_v_mpt.py
index 82e2d05550..46172faf35 100644
--- a/tests/test_hf_v_mpt.py
+++ b/tests/test_hf_v_mpt.py
@@ -5,7 +5,6 @@
import pytest
import torch
-from composer.utils import reproducibility
from omegaconf import OmegaConf as om
from llmfoundry import COMPOSER_MODEL_REGISTRY
@@ -52,10 +51,6 @@ def test_compare_hf_v_mpt(attn_impl: str, dropout: float, alibi: bool,
batch_size = 2 # set batch size
     device = 'cuda'  # set device
- # ensure reproducibility
- seed = 17
- reproducibility.seed_all(seed) # set seed
-
# get hf gpt2 cfg
hf_cfg = om.create({
'model': {
@@ -154,11 +149,9 @@ def test_compare_hf_v_mpt(attn_impl: str, dropout: float, alibi: bool,
# UTIL: can be used to verify that models are not the same at init
with torch.autocast(device_type='cuda', dtype=torch.float16):
- torch.manual_seed(seed)
hf_model_fwd = hf_model(batch)['logits']
if kpm is not None:
hf_model_fwd *= kpm
- torch.manual_seed(seed)
model_fwd = model(batch).logits
if kpm is not None:
model_fwd *= kpm
@@ -208,11 +201,9 @@ def test_compare_hf_v_mpt(attn_impl: str, dropout: float, alibi: bool,
model.load_state_dict(_hf_model_statedict)
with torch.autocast(device_type=device, dtype=torch.float16):
- torch.manual_seed(seed)
hf_model_fwd = hf_model(batch)['logits']
if kpm is not None:
hf_model_fwd *= kpm
- torch.manual_seed(seed)
model_fwd = model(batch).logits
if kpm is not None:
model_fwd *= kpm
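The removals above (and the matching removals elsewhere in this diff) drop per-test seeding. One common way to centralize that, sketched here purely as an assumption about how the suite might do it, is an autouse fixture in conftest.py:

import pytest
from composer.utils import reproducibility


@pytest.fixture(autouse=True)
def seed_all_rngs():
    # Seed the python, numpy, and torch RNGs before each test so individual
    # tests no longer need to call reproducibility.seed_all themselves.
    reproducibility.seed_all(17)
    yield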
diff --git a/tests/test_huggingface_flash.py b/tests/test_huggingface_flash.py
new file mode 100644
index 0000000000..834488bb6a
--- /dev/null
+++ b/tests/test_huggingface_flash.py
@@ -0,0 +1,245 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
+import contextlib
+import os
+from unittest.mock import patch
+
+import pytest
+import torch
+import transformers
+from composer.core.precision import get_precision_context
+from composer.utils import reproducibility
+from omegaconf import DictConfig
+from omegaconf import OmegaConf as om
+
+from llmfoundry import COMPOSER_MODEL_REGISTRY
+from llmfoundry.models.hf.hf_fsdp import rgetattr
+from llmfoundry.models.layers.attention import (is_flash_v1_installed,
+ is_flash_v2_installed)
+from llmfoundry.utils.builders import build_tokenizer
+
+# Before importing any transformers models, we need to disable transformers flash attention if
+# we are in an environment with flash attention version <2. Otherwise, transformers raises a hard
+# error because of an improperly gated import.
+if is_flash_v1_installed():
+ transformers.utils.is_flash_attn_available = lambda: False
+
+from transformers.models.llama.modeling_llama import LlamaAttention
+
+from llmfoundry.models.layers.llama_attention_monkeypatch import (
+ llama_attention_patch_torch, llama_attention_patch_triton)
+
+
+@pytest.mark.parametrize('patch_fn_name', ['torch', 'triton'])
+@pytest.mark.parametrize('explicit_mask', [True, False])
+@pytest.mark.parametrize(
+ 'model_name', ['meta-llama/Llama-2-7b-hf', 'meta-llama/Llama-2-70b-hf'])
+@pytest.mark.gpu
+def test_patch_equivalence(patch_fn_name: str, explicit_mask: bool,
+ model_name: str):
+ if 'HUGGING_FACE_HUB_TOKEN' not in os.environ:
+ pytest.skip(
+ 'The CI cluster does not have access to the Llama models, so skip this test.'
+ )
+
+ device = 'cuda:0'
+ sequence_length = 4096
+ model_dim = 4096 if '7b' in model_name else 8192
+ batch_size = 2
+ if patch_fn_name == 'torch':
+ patch_fn = llama_attention_patch_torch
+ dtype = torch.float32
+ atol = 0.0
+ rtol = 0.0
+ elif patch_fn_name == 'triton':
+        # The huggingface implementation of llama performs the softmax in fp32.
+        # This can result in fairly large differences for the triton implementation,
+        # but the torch implementation produces exactly the same output, so we can
+        # confirm that the patch is correct.
+ patch_fn = llama_attention_patch_triton
+ dtype = torch.bfloat16
+ atol = 1e-2
+ rtol = 1e-2
+ else:
+ raise ValueError(f'Unknown patch_fn_name: {patch_fn_name}')
+
+ llama_config = transformers.AutoConfig.from_pretrained(model_name,
+ use_auth_token=True)
+
+ reproducibility.seed_all(42)
+ attention = LlamaAttention(config=llama_config,)
+ attention.to(dtype=dtype, device=device)
+
+ rng = torch.Generator(device=device).manual_seed(42)
+ hidden_states = torch.randn(batch_size,
+ sequence_length,
+ model_dim,
+ generator=rng,
+ dtype=dtype,
+ device=device)
+ causal_mask = torch.full((sequence_length, sequence_length),
+ torch.finfo(torch.float32).min,
+ device=device)
+ causal_mask = causal_mask.triu(diagonal=1)
+ causal_mask = causal_mask[None,
+ None, :, :].expand(batch_size, 1, sequence_length,
+ sequence_length)
+ attn_output, _, _ = attention(
+ hidden_states=hidden_states,
+ attention_mask=causal_mask if explicit_mask else None,
+ position_ids=None,
+ past_key_value=None,
+ use_cache=False,
+ )
+
+ reproducibility.seed_all(42)
+ with patch.object(LlamaAttention, 'forward', new=patch_fn):
+ attention = LlamaAttention(config=llama_config,)
+ attention.to(dtype=dtype, device=device)
+ new_output, _, _ = attention(
+ hidden_states=hidden_states,
+ attention_mask=causal_mask if explicit_mask else None,
+ position_ids=None,
+ past_key_value=None,
+ use_cache=False,
+ )
+
+ assert torch.allclose(attn_output, new_output, atol=atol, rtol=rtol)
+
+
+@pytest.mark.gpu
+@pytest.mark.parametrize('patch', ['triton', 'torch'])
+def test_attn_patch_integration(patch: str):
+ if 'HUGGING_FACE_HUB_TOKEN' not in os.environ:
+ pytest.skip(
+ 'The CI cluster does not have access to the Llama models, so skip this test.'
+ )
+
+ # Save the original attention function to restore at the end of the test.
+ from transformers.models.llama.modeling_llama import LlamaAttention
+ original_attn = LlamaAttention.forward
+
+ name = 'meta-llama/Llama-2-7b-hf'
+ model_cfg = DictConfig({
+ 'name': 'hf_causal_lm',
+ 'pretrained_model_name_or_path': name,
+ 'config_overrides': {
+ 'num_hidden_layers': 2,
+ 'intermediate_size': 64,
+ },
+ 'use_auth_token': True,
+ 'pretrained': False,
+ 'init_device': 'cpu',
+ 'attention_patch_type': patch
+ })
+
+ tokenizer = build_tokenizer(name, tokenizer_kwargs={})
+ tokenizer.pad_token = tokenizer.eos_token
+
+ model = COMPOSER_MODEL_REGISTRY[model_cfg['name']](model_cfg, tokenizer)
+
+ tokenized_input = tokenizer(['Hello world blah blah', 'Goodbye world'],
+ return_tensors='pt',
+ padding=True)
+ tokenized_input['labels'] = tokenized_input['input_ids'].clone()
+
+ tokenized_input = {k: v.cuda() for k, v in tokenized_input.items()}
+ model.to('cuda')
+
+ with get_precision_context('amp_bf16'):
+ # We're just testing that the attention patch runs okay
+ outputs = model(tokenized_input)
+ loss = outputs.loss
+ loss.backward()
+
+ # Ensure the patch does not persist beyond this test.
+ LlamaAttention.forward = original_attn
+
+
+@pytest.mark.gpu
+@pytest.mark.parametrize('model_name', ['llama2', 'mistral'])
+@pytest.mark.parametrize('use_flash_attention_2', [True, False])
+def test_flash2(model_name: str, use_flash_attention_2: bool):
+ if model_name == 'llama2':
+ if 'HUGGING_FACE_HUB_TOKEN' not in os.environ:
+ pytest.skip(
+ 'The CI cluster does not have access to the Llama models, so skip this test.'
+ )
+ model_cfg = {
+ 'name': 'hf_causal_lm',
+ 'pretrained_model_name_or_path': 'meta-llama/Llama-2-7b-hf',
+ 'config_overrides': {
+ 'num_hidden_layers': 2,
+ 'intermediate_size': 64,
+ },
+ 'use_auth_token': True,
+ 'pretrained': False,
+ 'init_device': 'cpu',
+ }
+
+ tokenizer_name = 'meta-llama/Llama-2-7b-hf'
+ from transformers.models.llama.modeling_llama import (
+ LlamaAttention, LlamaFlashAttention2)
+ flash_attn_class = LlamaFlashAttention2 if use_flash_attention_2 else LlamaAttention
+ attention_layers_attr = 'model.model.layers'
+ attention_attr = 'self_attn'
+ elif model_name == 'mistral':
+ model_cfg = {
+ 'name': 'hf_causal_lm',
+ 'pretrained_model_name_or_path': 'mistralai/Mistral-7B-v0.1',
+ 'config_overrides': {
+ 'num_hidden_layers': 2,
+ 'intermediate_size': 64,
+ },
+ 'pretrained': False,
+ 'init_device': 'cpu',
+ }
+
+ tokenizer_name = 'mistralai/Mistral-7B-v0.1'
+ from transformers.models.mistral.modeling_mistral import (
+ MistralAttention, MistralFlashAttention2)
+ flash_attn_class = MistralFlashAttention2 if use_flash_attention_2 else MistralAttention
+ attention_layers_attr = 'model.model.layers'
+ attention_attr = 'self_attn'
+ else:
+ raise ValueError(f'Unknown model: {model_name}')
+
+ if use_flash_attention_2:
+ model_cfg['use_flash_attention_2'] = True
+
+ model_cfg = om.create(model_cfg)
+
+ tokenizer = build_tokenizer(
+ tokenizer_name=tokenizer_name,
+ tokenizer_kwargs={'model_max_length': 10},
+ )
+ tokenizer.pad_token = tokenizer.eos_token
+
+ error_context = pytest.raises(
+ ValueError, match='use_flash_attention_2 is set to True'
+ ) if not is_flash_v2_installed(
+ ) and use_flash_attention_2 else contextlib.nullcontext()
+
+ with error_context:
+ model = COMPOSER_MODEL_REGISTRY[model_cfg['name']](model_cfg, tokenizer)
+
+        # check that it actually used flash attention 2
+        assert (bool(model.model.config._flash_attn_2_enabled) ==
+                use_flash_attention_2)
+ attention_layer = rgetattr(
+ rgetattr(model, attention_layers_attr)[0], attention_attr)
+ assert isinstance(attention_layer, flash_attn_class)
+
+ tokenized_input = tokenizer(['Hello world blah blah', 'Goodbye world'],
+ return_tensors='pt',
+ padding=True)
+ tokenized_input['labels'] = tokenized_input['input_ids'].clone()
+
+ tokenized_input = {k: v.cuda() for k, v in tokenized_input.items()}
+ model.to('cuda')
+
+ with get_precision_context('amp_bf16'):
+ # We're just testing that flash attention 2 runs okay
+ outputs = model(tokenized_input)
+ loss = outputs.loss
+ loss.backward()
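The `error_context` idiom in `test_flash2` selects between `pytest.raises` and `contextlib.nullcontext` so a single `with` block covers both the failure and the success path. A self-contained sketch of the same pattern; the `maybe_fail` helper below is purely illustrative and not part of the repository:

import contextlib

import pytest


def maybe_fail(should_fail: bool) -> None:
    # Stand-in for building a model with use_flash_attention_2=True when
    # flash-attn 2 is not installed.
    if should_fail:
        raise ValueError('use_flash_attention_2 is set to True')


@pytest.mark.parametrize('should_fail', [True, False])
def test_conditional_error_context(should_fail: bool):
    error_context = pytest.raises(
        ValueError, match='use_flash_attention_2'
    ) if should_fail else contextlib.nullcontext()

    with error_context:
        maybe_fail(should_fail)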
diff --git a/tests/test_init_fn.py b/tests/test_init_fn.py
index b054bac186..6be2c5ca42 100644
--- a/tests/test_init_fn.py
+++ b/tests/test_init_fn.py
@@ -8,7 +8,6 @@
import pytest
import torch
-from composer.utils import reproducibility
from omegaconf import DictConfig, ListConfig
from omegaconf import OmegaConf as om
from torch import nn
@@ -35,8 +34,6 @@ def forward(self, x: torch.Tensor):
@pytest.mark.parametrize('is_residual', [True, False])
def test_div_is_residual(is_residual: bool):
- reproducibility.seed_all(7)
-
in_features, out_features = 8, 32
cfg = om.create({
'in_features': in_features,
@@ -64,8 +61,6 @@ def test_div_is_residual(is_residual: bool):
@pytest.mark.parametrize('fused', [True, False])
def test_fused_init_helper(fused: bool):
- reproducibility.seed_all(7)
-
in_features, out_features = 8, 32
cfg = om.create({
'in_features': in_features,
@@ -133,8 +128,6 @@ def max_fill_init_(weight: torch.Tensor):
('emb_init_uniform_lim', [1, 1])
])
def test_emb_init(emb_init_cfg: Optional[Tuple[str, Union[int, List[int]]]]):
- reproducibility.seed_all(7)
-
cfg: Dict[str, Union[int, List[int]]] = {
'vocab_size': 64,
'in_features': 16,
diff --git a/tests/test_lion8b.py b/tests/test_lion8b.py
index ddb70e882b..0c7010ce9f 100644
--- a/tests/test_lion8b.py
+++ b/tests/test_lion8b.py
@@ -24,6 +24,7 @@
LocalOptimStateDictConfig = MagicMock()
ShardedOptimStateDictConfig = MagicMock()
+from llmfoundry.optim import DecoupledLionW
from llmfoundry.optim import DecoupledLionW_8bit as Lion8bit
warnings.filterwarnings('ignore')
@@ -406,8 +407,12 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # type:ignore
@pytest.mark.parametrize('use_errors', [False, True])
@pytest.mark.parametrize('state_sharding',
[_FULL_STATE, _SHARDED_STATE, _LOCAL_STATE])
+@pytest.mark.parametrize('save_as_lion8b, load_as_lion8b', [(False, True),
+ (True, False),
+ (True, True)])
def test_fsdp_save_load(dtype: torch.dtype, use_errors: bool,
- state_sharding: fsdp.StateDictType):
+ state_sharding: fsdp.StateDictType,
+ save_as_lion8b: bool, load_as_lion8b: bool):
device = 'cuda'
if torch.cuda.device_count() < 2:
pytest.skip(f'This test requires 2+ GPUs.')
@@ -419,6 +424,10 @@ def test_fsdp_save_load(dtype: torch.dtype, use_errors: bool,
dist.init_process_group(backend='nccl')
assert dist.get_world_size() >= 2, 'Misconfigured test run!'
+    # nb: this is the line that causes:
+    # `Warning: Deallocating Tensor that still has live PyObject references.`
+    # which suggests the warning isn't an issue with our test code. It also
+    # goes to stdout (probably from cpp), so we can't suppress it with the
+    # warnings module.
mod = FSDP(_DummyModule(device=device, dtype=dtype))
# actual forward pass instead of setting p.grad to avoid FSDP issues
@@ -429,7 +438,10 @@ def test_fsdp_save_load(dtype: torch.dtype, use_errors: bool,
p.grad = torch.rand_like(p)
# create optimizer and have it step so that state gets populated
- opt = Lion8bit(mod.parameters(), error_correction=use_errors)
+ if save_as_lion8b:
+ opt = Lion8bit(mod.parameters(), error_correction=use_errors)
+ else:
+ opt = DecoupledLionW(mod.parameters())
opt.step()
opt.zero_grad()
@@ -449,13 +461,22 @@ def _set_state_dict_type(model: nn.Module):
FSDP.set_state_dict_type(model, state_sharding, state_dict_cfg,
optim_cfg)
+ def _local_shard(t: torch.Tensor) -> torch.Tensor:
+ try: # can't operate on ShardedTensors directly
+ return t.local_tensor() # type: ignore
+ except AttributeError:
+ return t
+
# load FSDP state dict
_set_state_dict_type(mod)
opt_state_dict = FSDP.optim_state_dict(mod, opt)
# make a new model and optimizer
mod_new = FSDP(_DummyModule(device=device, dtype=dtype))
- opt_new = Lion8bit(mod_new.parameters(), error_correction=use_errors)
+ if load_as_lion8b:
+ opt_new = Lion8bit(mod_new.parameters(), error_correction=use_errors)
+ else:
+ opt_new = DecoupledLionW(mod_new.parameters())
_set_state_dict_type(mod_new)
# load state dict into the new optimizer
@@ -480,22 +501,26 @@ def _set_state_dict_type(model: nn.Module):
mom_new = d_new['exp_avg']
assert mom_orig.shape == mom_new.shape
- assert mom_orig.dtype == mom_new.dtype
- if use_errors and (dtype != torch.float32):
- errs_orig = d_orig['errors']
- errs_new = d_new['errors']
- assert errs_orig.shape == errs_new.shape
- assert errs_orig.dtype == errs_new.dtype
-
- if state_sharding != _FULL_STATE:
- continue # more detailed checks lean on FSDP impl details
+ both_lion8b = save_as_lion8b and load_as_lion8b
+ check_errors = both_lion8b and use_errors and (dtype != torch.float32)
+ if both_lion8b:
+ assert mom_orig.dtype == mom_new.dtype
+ if check_errors:
+ errs_orig = d_orig['errors']
+ errs_new = d_new['errors']
+ assert errs_orig.shape == errs_new.shape
+ assert errs_orig.dtype == errs_new.dtype
# momentums may not be bit-for-bit identical because Optimizer upcasts
# to f32 and we convert back to bf16, possibly with different rounding
- torch.testing.assert_close(mom_orig, mom_new)
+ torch.testing.assert_close(_local_shard(mom_orig).float(),
+ _local_shard(mom_new).float(),
+ atol=1e-4,
+ rtol=1. / 128)
# errors not bit-for-bit identical because scales get upcast too
- if use_errors and (dtype != torch.float32):
- torch.testing.assert_close(d_orig['errors'], d_new['errors'])
+ if check_errors:
+ torch.testing.assert_close(_local_shard(d_orig['errors']),
+ _local_shard(d_new['errors']))
@pytest.mark.gpu
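The looser `atol=1e-4, rtol=1. / 128` above follows from the bf16 case noted in the comment: bfloat16 keeps 8 significand bits, so a float32 -> bfloat16 round trip can move a value by up to about 1/256 relatively, and two independently round-tripped copies of the same momentum can differ by roughly 1/128. A quick standalone check of that bound:

import torch

# Values bounded away from zero so relative error is well defined.
x = torch.rand(10_000, dtype=torch.float32) + 0.5
roundtrip = x.to(torch.bfloat16).float()
max_rel_err = ((roundtrip - x).abs() / x.abs()).max().item()

# Unit roundoff for bfloat16 (8 significand bits) is 2**-8 == 1/256.
assert max_rel_err <= 1.0 / 256 + 1e-7
print(f'max bf16 round-trip relative error: {max_rel_err:.6f}')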
diff --git a/tests/test_llama_patch.py b/tests/test_llama_patch.py
deleted file mode 100644
index b1cd3711e0..0000000000
--- a/tests/test_llama_patch.py
+++ /dev/null
@@ -1,95 +0,0 @@
-# Copyright 2022 MosaicML LLM Foundry authors
-# SPDX-License-Identifier: Apache-2.0
-
-import os
-
-import pytest
-import torch
-import transformers
-from composer.utils import reproducibility
-from transformers.models.llama.modeling_llama import LlamaAttention
-
-from llmfoundry.models.layers.llama_attention_monkeypatch import (
- llama_attention_patch_torch, llama_attention_patch_triton)
-
-
-@pytest.mark.parametrize('patch_fn_name', ['torch', 'triton'])
-@pytest.mark.parametrize('explicit_mask', [True, False])
-@pytest.mark.parametrize(
- 'model_name', ['meta-llama/Llama-2-7b-hf', 'meta-llama/Llama-2-70b-hf'])
-@pytest.mark.gpu
-def test_patch_equivalence(patch_fn_name: str, explicit_mask: bool,
- model_name: str):
- if 'HUGGING_FACE_HUB_TOKEN' not in os.environ:
- pytest.skip(
- 'The CI cluster does not have access to the Llama models, so skip this test.'
- )
-
- original_forward = LlamaAttention.forward
-
- device = 'cuda:0'
- sequence_length = 4096
- model_dim = 4096 if '7b' in model_name else 8192
- batch_size = 2
- if patch_fn_name == 'torch':
- patch_fn = llama_attention_patch_torch
- dtype = torch.float32
- atol = 0.0
- rtol = 0.0
- elif patch_fn_name == 'triton':
- # the huggingface implementation of llama performs the softmax in fp32
- # this can result in fairly large differences for the triton implementation
- # but the torch implementation produces the exact same output so we can confirm
- # the implementation is correct
- patch_fn = llama_attention_patch_triton
- dtype = torch.bfloat16
- atol = 1e-2
- rtol = 1e-2
- else:
- raise ValueError(f'Unknown patch_fn_name: {patch_fn_name}')
-
- llama_config = transformers.AutoConfig.from_pretrained(model_name,
- use_auth_token=True)
-
- reproducibility.seed_all(42)
- attention = LlamaAttention(config=llama_config,)
- attention.to(dtype=dtype, device=device)
-
- rng = torch.Generator(device=device).manual_seed(42)
- hidden_states = torch.randn(batch_size,
- sequence_length,
- model_dim,
- generator=rng,
- dtype=dtype,
- device=device)
- causal_mask = torch.full((sequence_length, sequence_length),
- torch.finfo(torch.float32).min,
- device=device)
- causal_mask = causal_mask.triu(diagonal=1)
- causal_mask = causal_mask[None,
- None, :, :].expand(batch_size, 1, sequence_length,
- sequence_length)
- attn_output, _, _ = attention(
- hidden_states=hidden_states,
- attention_mask=causal_mask if explicit_mask else None,
- position_ids=None,
- past_key_value=None,
- use_cache=False,
- )
-
- reproducibility.seed_all(42)
- LlamaAttention.forward = patch_fn
- attention = LlamaAttention(config=llama_config,)
- attention.to(dtype=dtype, device=device)
- new_output, _, _ = attention(
- hidden_states=hidden_states,
- attention_mask=causal_mask if explicit_mask else None,
- position_ids=None,
- past_key_value=None,
- use_cache=False,
- )
-
- # Reset the forward function so patches don't persist
- LlamaAttention.forward = original_forward
-
- assert torch.allclose(attn_output, new_output, atol=atol, rtol=rtol)
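The deleted test above swapped `LlamaAttention.forward` by hand and had to restore it at the end; its replacement in `tests/test_huggingface_flash.py` uses `unittest.mock.patch.object` as a context manager, which restores the original attribute even if the body raises. A minimal sketch of that pattern; the `Greeter` class is purely illustrative:

from unittest.mock import patch


class Greeter:

    def greet(self) -> str:
        return 'hello'


def shouty_greet(self: Greeter) -> str:
    return 'HELLO'


def test_patch_object_restores_original():
    with patch.object(Greeter, 'greet', new=shouty_greet):
        assert Greeter().greet() == 'HELLO'
    # The original method is back once the context manager exits.
    assert Greeter().greet() == 'hello'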
diff --git a/tests/test_model.py b/tests/test_model.py
index 6ea530731a..8ec19bb27b 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -12,7 +12,9 @@
import pytest
import torch
import torch.nn as nn
+from torch.distributed._tensor.api import DTensor
from accelerate import init_empty_weights
+from composer import Trainer
from composer.core.precision import Precision, get_precision_context
from composer.optim import DecoupledAdamW
from composer.trainer.dist_strategy import prepare_fsdp_module
@@ -25,12 +27,13 @@
from transformers.modeling_outputs import CausalLMOutputWithPast
from transformers.models.bloom.modeling_bloom import build_alibi_tensor
-from llmfoundry import (COMPOSER_MODEL_REGISTRY, ComposerHFCausalLM,
- ComposerHFPrefixLM)
+from llmfoundry import COMPOSER_MODEL_REGISTRY, ComposerHFCausalLM
from llmfoundry.models.hf.model_wrapper import HuggingFaceModelWithZLoss
from llmfoundry.models.layers import NORM_CLASS_REGISTRY, build_alibi_bias
+from llmfoundry.models.layers.attention import is_flash_v2_installed
from llmfoundry.models.layers.blocks import MPTBlock
from llmfoundry.models.mpt import MPTConfig, MPTForCausalLM
+from llmfoundry.models.mpt.modeling_mpt import rearrange_tensor
from llmfoundry.utils import build_tokenizer
@@ -56,8 +59,6 @@ def get_objs(conf_path: str = 'scripts/train/yamls/pretrain/testing.yaml'):
message='Torchmetrics v0.9 introduced a new argument class property')
test_cfg = get_config(conf_path=conf_path)
- reproducibility.seed_all(test_cfg.seed)
-
# Read FSDP Config as a dict
fsdp_config = test_cfg.get('fsdp_config', None)
fsdp_config = om.to_container(fsdp_config,
@@ -306,18 +307,13 @@ def test_full_forward_and_backward_t5_small(batch_size: int = 2):
assert not torch.equal(original_params, updated_params)
+@pytest.mark.gpu
@pytest.mark.parametrize(
'attn_impl,precision',
[('torch', torch.float16), ('torch', torch.bfloat16),
pytest.param('flash', torch.float16, marks=pytest.mark.gpu),
pytest.param('flash', torch.bfloat16, marks=pytest.mark.gpu)])
def test_determinism(attn_impl: str, precision: torch.dtype):
- if not torch.cuda.is_available():
- pytest.skip(
- 'This test requires CUDA to be available in order to run with bfloat16 precision.'
- )
- reproducibility.seed_all(1111)
-
conf_path = 'scripts/train/yamls/pretrain/testing.yaml'
with open(conf_path) as f:
test_cfg = om.load(f)
@@ -394,8 +390,6 @@ def test_loss_fn():
'init_std': 0.02,
}
- reproducibility.seed_all(test_cfg.get('global_seed', 42))
-
tokenizer_cfg: Dict[str, Any] = _load_tokenizer_cfg(test_cfg.tokenizer)
tokenizer = build_tokenizer(test_cfg.tokenizer.name,
tokenizer_cfg.get('kwargs', {}))
@@ -443,11 +437,10 @@ def test_loss_fn():
atol=1e-4), f'differed at step {i}'
-@pytest.mark.parametrize('prefixlm', [False, True])
-def test_opt_wrapping(prefixlm: bool):
+def test_opt_wrapping():
conf = {
'model': {
- 'name': 'hf_prefix_lm' if prefixlm else 'hf_causal_lm',
+ 'name': 'hf_causal_lm',
'pretrained_model_name_or_path': 'facebook/opt-125m',
'pretrained': 'false'
},
@@ -461,10 +454,7 @@ def test_opt_wrapping(prefixlm: bool):
tokenizer = build_tokenizer(config.tokenizer.name,
tokenizer_cfg.get('kwargs', {}))
- if prefixlm:
- model = ComposerHFPrefixLM(config.model, tokenizer)
- else:
- model = ComposerHFCausalLM(config.model, tokenizer)
+ model = ComposerHFCausalLM(config.model, tokenizer)
# check that all the modules we except are blocked from FSDP wrapping
assert not model.model.model._fsdp_wrap
@@ -475,7 +465,8 @@ def test_opt_wrapping(prefixlm: bool):
@pytest.mark.parametrize('norm_type', NORM_CLASS_REGISTRY.keys())
@pytest.mark.parametrize('no_bias', [False, True])
-def test_mpt_creation(norm_type: str, no_bias: bool):
+@pytest.mark.parametrize('tie_word_embeddings', [True, False])
+def test_mpt_creation(norm_type: str, no_bias: bool, tie_word_embeddings: bool):
# Test that the config constructs the model as expected.
hf_config = MPTConfig(
init_device='cpu',
@@ -491,6 +482,7 @@ def test_mpt_creation(norm_type: str, no_bias: bool):
},
norm_type=norm_type,
no_bias=no_bias,
+ tie_word_embeddings=tie_word_embeddings,
)
mpt = MPTForCausalLM(hf_config)
@@ -502,6 +494,9 @@ def test_mpt_creation(norm_type: str, no_bias: bool):
assert mpt.transformer.wte.weight.shape == torch.Size(
[hf_config.vocab_size, hf_config.d_model])
+ if not tie_word_embeddings:
+ assert mpt.lm_head is not None
+ assert mpt.lm_head.weight.shape == mpt.transformer.wte.weight.shape
assert mpt.transformer.wpe.weight.shape == torch.Size(
[hf_config.max_seq_len, hf_config.d_model])
assert mpt.transformer.emb_drop.p == 0.1
@@ -523,22 +518,53 @@ def test_mpt_creation(norm_type: str, no_bias: bool):
assert block.resid_ffn_dropout.p == 0.2
-@pytest.mark.parametrize('attention_impl,device', [('torch', 'cpu'),
- ('flash', 'gpu'),
- ('triton', 'gpu'),
- ('torch', 'gpu')])
-@pytest.mark.parametrize('alibi', [True, False])
-def test_forward_with_padding(attention_impl: str, device: str, alibi: bool):
+@pytest.mark.parametrize('attention_impl', [
+ 'torch',
+ pytest.param('flash', marks=pytest.mark.gpu),
+ pytest.param('triton', marks=pytest.mark.gpu),
+ pytest.param('torch', marks=pytest.mark.gpu)
+])
+@pytest.mark.parametrize('pos_emb_config', [{
+ 'alibi': False,
+ 'rope': False
+}, {
+ 'alibi': True,
+ 'rope': False
+}, {
+ 'alibi': False,
+ 'rope': True,
+ 'rope_theta': 10000,
+ 'rope_impl': 'dail',
+ 'rope_dail_config': {
+ 'type': 'original',
+ 'pos_idx_in_fp32': True,
+ 'xpos_scale_base': 512,
+ },
+}, {
+ 'alibi': False,
+ 'rope': True,
+ 'rope_theta': 10000,
+ 'rope_impl': 'hf',
+ 'rope_hf_config': {
+ 'type': 'no_scaling',
+ 'factor': 1.0,
+ },
+}])
+@pytest.mark.parametrize('tie_word_embeddings', [True, False])
+def test_forward_with_padding(attention_impl: str, pos_emb_config: dict,
+ tie_word_embeddings: bool):
# Test that different placement of padding does not affect the output.
- if not torch.cuda.is_available() and device == 'gpu':
- pytest.skip(
- f'This test requires CUDA to be available in order to run with {attention_impl} attention.'
- )
+ alibi = pos_emb_config['alibi']
if alibi and attention_impl == 'flash':
pytest.skip(f'alibi only implemented with torch and triton attention.')
- reproducibility.seed_all(1234)
- composer_device = get_device(device)
+ rope = pos_emb_config['rope']
+ if rope and pos_emb_config[
+ 'rope_impl'] == 'dail' and not is_flash_v2_installed():
+ pytest.skip(
+ f'dail implementation of rope requires gpu and flash attention 2.')
+
+ composer_device = get_device(None)
hf_config = MPTConfig(
init_device='cpu',
@@ -551,12 +577,13 @@ def test_forward_with_padding(attention_impl: str, device: str, alibi: bool):
resid_pdrop=0.2,
attn_config={
'attn_impl': attention_impl,
- 'alibi': alibi,
+ **pos_emb_config,
},
init_config={
'name': 'baseline_',
'init_std': 0.02,
},
+ tie_word_embeddings=tie_word_embeddings,
)
mpt = MPTForCausalLM(hf_config)
mpt.eval()
@@ -623,23 +650,35 @@ def test_forward_with_padding(attention_impl: str, device: str, alibi: bool):
attention_mask=batched_attention_mask).logits
# check that right padding and left padding produce the same output
+ right_pad_v_left_pad_rtol = 1e-5
+ right_pad_v_left_pad_atol = 1e-6 if attention_impl == 'torch' else 1e-8
+ if rope and pos_emb_config['rope_impl'] == 'dail':
+        # The dail implementation of rope uses bf16 precision, so the rotations
+        # have small numerical errors. This causes some differences between the
+        # outputs of padded and unpadded inputs.
+ right_pad_v_left_pad_rtol = 1e-2
+ right_pad_v_left_pad_atol = 1e-2
assert torch.allclose(right_padding_output[0, :3],
left_padding_output[0, 3:],
- atol=1e-6 if attention_impl == 'torch' else 1e-8)
- if not alibi:
+ rtol=right_pad_v_left_pad_rtol,
+ atol=right_pad_v_left_pad_atol)
+
+ if not (alibi or (rope and pos_emb_config['rope_impl'] == 'dail')):
# check that right padding and middle padding produce the same output
# Note: alibi not implemented for middle padding.
+ # Note: dail implementation of rope does not support middle padding.
assert torch.allclose(
right_padding_output[0, :3],
middle_padding_output[0, [0, 1, 5]],
atol=1e-6 if attention_impl == 'torch' else 1e-8)
+
# check that right padding and right padding in a batch produce the same output
assert torch.allclose(right_padding_output[0, :3],
batched_output[0, :3],
atol=1e-6 if attention_impl == 'torch' else 1e-8)
- if not alibi:
+
+ if not (alibi or (rope and pos_emb_config['rope_impl'] == 'dail')):
# check that middle padding and middle padding in a batch produce the same output
# Note: alibi not implemented for middle padding.
+ # Note: dail implementation of rope does not support middle padding.
assert torch.allclose(
middle_padding_output[0],
batched_output[1, :],
@@ -701,23 +740,55 @@ def test_advanced_mask_building(attention_impl: str):
assert torch.equal(attn_bias, expected_attn_bias)
-@pytest.mark.parametrize('attention_impl,device', [('torch', 'cpu'),
- ('flash', 'gpu'),
- ('triton', 'gpu'),
- ('torch', 'gpu')])
-@pytest.mark.parametrize('alibi', [True, False])
-def test_generate(attention_impl: str, device: str, alibi: bool):
+@pytest.mark.parametrize('attention_impl,precision', [
+ ('torch', 'fp32'),
+ pytest.param('flash', 'amp_bf16', marks=pytest.mark.gpu),
+ pytest.param('triton', 'amp_bf16', marks=pytest.mark.gpu),
+ pytest.param('torch', 'amp_bf16', marks=pytest.mark.gpu),
+ pytest.param('torch', 'fp32', marks=pytest.mark.gpu),
+])
+@pytest.mark.parametrize('pos_emb_config', [{
+ 'alibi': False,
+ 'rope': False
+}, {
+ 'alibi': True,
+ 'rope': False
+}, {
+ 'alibi': False,
+ 'rope': True,
+ 'rope_theta': 10000,
+ 'rope_impl': 'dail',
+ 'rope_dail_config': {
+ 'type': 'original',
+ 'pos_idx_in_fp32': True,
+ 'xpos_scale_base': 512,
+ },
+}, {
+ 'alibi': False,
+ 'rope': True,
+ 'rope_theta': 10000,
+ 'rope_impl': 'hf',
+ 'rope_hf_config': {
+ 'type': 'no_scaling',
+ 'factor': 1.0,
+ },
+}])
+@pytest.mark.parametrize('tie_word_embeddings', [True, False])
+def test_generate(attention_impl: str, precision: str, pos_emb_config: dict,
+ tie_word_embeddings: bool):
# Test that generate works, and produces the same output with or without
# padding in the input.
- if not torch.cuda.is_available() and device == 'gpu':
- pytest.skip(
- f'This test requires CUDA to be available in order to run with {attention_impl} attention.'
- )
- if alibi and attention_impl == 'flash':
+ if pos_emb_config['alibi'] and attention_impl == 'flash':
pytest.skip(f'alibi only implemented with torch and triton attention.')
- reproducibility.seed_all(1234)
- composer_device = get_device(device)
+ if pos_emb_config['rope'] and pos_emb_config[
+ 'rope_impl'] == 'dail' and not is_flash_v2_installed():
+ pytest.skip(
+ f'dail implementation of rope requires gpu and flash attention 2.')
+    if attention_impl == 'torch' and precision == 'amp_bf16' and not tie_word_embeddings:
+ pytest.skip(f'This test configuration has precision / sampling issues.')
+
+ composer_device = get_device(None)
hf_config = MPTConfig(
init_device='cpu',
@@ -730,12 +801,13 @@ def test_generate(attention_impl: str, device: str, alibi: bool):
resid_pdrop=0.2,
attn_config={
'attn_impl': attention_impl,
- 'alibi': alibi,
+ **pos_emb_config,
},
+ tie_word_embeddings=tie_word_embeddings,
)
mpt = MPTForCausalLM(hf_config)
- mpt.eval()
mpt = composer_device.module_to_device(mpt)
+ mpt.eval()
# padding on the left of the input
left_padding_input_ids = torch.tensor(
@@ -766,8 +838,7 @@ def test_generate(attention_impl: str, device: str, alibi: bool):
batched_attention_mask = composer_device.tensor_to_device(
batched_attention_mask)
- with get_precision_context('amp_bf16' if composer_device.name ==
- 'gpu' else 'fp32'):
+ with get_precision_context(precision):
# check that a batch with different amounts of padding doesn't crash
# and produces the right output shape
batched_generation = mpt.generate(input_ids=batched_input_ids,
@@ -776,14 +847,12 @@ def test_generate(attention_impl: str, device: str, alibi: bool):
use_cache=False)
assert batched_generation.shape == (2, 6 + 5)
- reproducibility.seed_all(1234)
generation_with_left_padding = mpt.generate(
input_ids=left_padding_input_ids,
attention_mask=left_padding_attention_mask,
max_new_tokens=5,
use_cache=False)
assert generation_with_left_padding.shape == (2, 6 + 5)
- reproducibility.seed_all(1234)
generation_with_no_padding = mpt.generate(
input_ids=no_padding_input_ids,
attention_mask=no_padding_attention_mask,
@@ -799,10 +868,9 @@ def test_generate(attention_impl: str, device: str, alibi: bool):
@pytest.mark.gpu
@pytest.mark.parametrize('world_size', [1, 2])
@pytest.mark.parametrize('use_cache', [False, True])
+@pytest.mark.parametrize('tie_word_embeddings', [True, False])
def test_generate_with_device_map(tmp_path: pathlib.Path, world_size: int,
- use_cache: bool):
- if not torch.cuda.is_available():
- pytest.skip(f'This test requires CUDA to be available.')
+ use_cache: bool, tie_word_embeddings: bool):
if not torch.cuda.device_count() >= world_size:
pytest.skip(f'This test requires {world_size} GPUs.')
@@ -820,6 +888,7 @@ def test_generate_with_device_map(tmp_path: pathlib.Path, world_size: int,
'attn_impl': 'torch',
},
use_cache=use_cache,
+ tie_word_embeddings=tie_word_embeddings,
)
mpt = MPTForCausalLM(hf_config)
mpt.save_pretrained(save_path)
@@ -900,9 +969,51 @@ def test_save_from_pretrained(tmp_path: pathlib.Path):
check_hf_model_equivalence(mpt, mpt2)
-@pytest.mark.parametrize('alibi', [True, False])
-def test_forward_with_cache_and_padding(alibi: bool):
+@pytest.mark.parametrize('attn_impl', [
+ 'torch',
+ pytest.param('flash', marks=pytest.mark.gpu),
+ pytest.param('triton', marks=pytest.mark.gpu),
+ pytest.param('torch', marks=pytest.mark.gpu),
+])
+@pytest.mark.parametrize('pos_emb_config', [{
+ 'alibi': False,
+ 'rope': False
+}, {
+ 'alibi': True,
+ 'rope': False
+}, {
+ 'alibi': False,
+ 'rope': True,
+ 'rope_theta': 10000,
+ 'rope_impl': 'dail',
+ 'rope_dail_config': {
+ 'type': 'original',
+ 'pos_idx_in_fp32': True,
+ 'xpos_scale_base': 512,
+ },
+}, {
+ 'alibi': False,
+ 'rope': True,
+ 'rope_theta': 10000,
+ 'rope_impl': 'hf',
+ 'rope_hf_config': {
+ 'type': 'no_scaling',
+ 'factor': 1.0,
+ },
+}])
+@pytest.mark.parametrize('tie_word_embeddings', [True, False])
+def test_forward_with_cache_and_padding(attn_impl: str, pos_emb_config: dict,
+ tie_word_embeddings: bool):
# Tests that the result is the same with or without padding when using kv caching
+ if pos_emb_config['alibi'] and attn_impl == 'flash':
+ pytest.skip(f'alibi only implemented with torch and triton attention.')
+ if pos_emb_config['rope'] and pos_emb_config[
+ 'rope_impl'] == 'dail' and not is_flash_v2_installed():
+ pytest.skip(
+ f'dail implementation of rope requires gpu and flash attention 2.')
+
+ composer_device = get_device(None)
+
hf_config = MPTConfig(
init_device='cpu',
d_model=128,
@@ -913,78 +1024,134 @@ def test_forward_with_cache_and_padding(alibi: bool):
emb_pdrop=0.1,
resid_pdrop=0.2,
attn_config={
- 'attn_impl': 'torch',
- 'alibi': alibi,
+ 'attn_impl': attn_impl,
+ **pos_emb_config,
},
use_cache=True,
init_config={
'name': 'baseline_',
'init_std': 0.02,
},
+ tie_word_embeddings=tie_word_embeddings,
)
mpt = MPTForCausalLM(hf_config)
+ mpt = composer_device.module_to_device(mpt)
mpt.eval()
-
- first_input_ids_no_padding = torch.tensor([[11274, 16390, 11]])
- first_attention_mask_no_padding = torch.tensor([[1, 1, 1]]).bool()
-
- # start with passing the first three tokens through (no padding)
- first_output_no_padding = mpt(
- first_input_ids_no_padding,
- attention_mask=first_attention_mask_no_padding)
-
- second_input_ids_no_padding = torch.tensor([[11274, 16390, 11, 11274]])
- second_attention_mask_no_padding = torch.tensor([[1, 1, 1, 1]]).bool()
-
- # pass through the fourth token by itself, using the key-value cache (no padding)
- second_output_no_padding = mpt(
- second_input_ids_no_padding[:, -1].unsqueeze(-1),
- attention_mask=second_attention_mask_no_padding,
- past_key_values=first_output_no_padding.past_key_values)
-
- first_input_ids_padding = torch.tensor([[50256, 11274, 16390, 11]])
- first_attention_mask_padding = torch.tensor([[0, 1, 1, 1]]).bool()
-
- # start with passing the first three tokens through (with left padding)
- first_output_padding = mpt(first_input_ids_padding,
- attention_mask=first_attention_mask_padding)
-
- second_input_ids_padding = torch.tensor([[50256, 11274, 16390, 11, 11274]])
- second_attention_mask_padding = torch.tensor([[0, 1, 1, 1, 1]]).bool()
-
- # pass through the fourth token by itself, using the key-value cache (with left padding)
- second_output_padding = mpt(
- second_input_ids_padding[:, -1].unsqueeze(-1),
- attention_mask=second_attention_mask_padding,
- past_key_values=first_output_padding.past_key_values)
-
- # check that the outputs are the same with or without padding
- torch.testing.assert_close(second_output_no_padding.logits,
- second_output_padding.logits[:,
- -1, :].unsqueeze(1),
- atol=1e-6,
- rtol=1e-6)
-
-
-@pytest.mark.parametrize('attn_impl,device', [
- ('torch', 'cpu'),
- ('flash', 'gpu'),
- ('triton', 'gpu'),
- ('torch', 'gpu'),
+ with get_precision_context('amp_bf16' if composer_device.name ==
+ 'gpu' else 'fp32'):
+ first_input_ids_no_padding = torch.tensor([[11274, 16390, 11]])
+ first_input_ids_no_padding = composer_device.tensor_to_device(
+ first_input_ids_no_padding)
+ first_attention_mask_no_padding = torch.tensor([[1, 1, 1]]).bool()
+ first_attention_mask_no_padding = composer_device.tensor_to_device(
+ first_attention_mask_no_padding)
+
+ # start with passing the first three tokens through (no padding)
+ first_output_no_padding = mpt(
+ first_input_ids_no_padding,
+ attention_mask=first_attention_mask_no_padding)
+
+ second_input_ids_no_padding = torch.tensor([[11274, 16390, 11, 11274]])
+ second_input_ids_no_padding = composer_device.tensor_to_device(
+ second_input_ids_no_padding)
+ second_attention_mask_no_padding = torch.tensor([[1, 1, 1, 1]]).bool()
+ second_attention_mask_no_padding = composer_device.tensor_to_device(
+ second_attention_mask_no_padding)
+
+ # pass through the fourth token by itself, using the key-value cache (no padding)
+ second_output_no_padding = mpt(
+ second_input_ids_no_padding[:, -1].unsqueeze(-1),
+ attention_mask=second_attention_mask_no_padding,
+ past_key_values=first_output_no_padding.past_key_values)
+
+ first_input_ids_padding = torch.tensor([[50256, 11274, 16390, 11]])
+ first_input_ids_padding = composer_device.tensor_to_device(
+ first_input_ids_padding)
+ first_attention_mask_padding = torch.tensor([[0, 1, 1, 1]]).bool()
+ first_attention_mask_padding = composer_device.tensor_to_device(
+ first_attention_mask_padding)
+
+ # start with passing the first three tokens through (with left padding)
+ first_output_padding = mpt(first_input_ids_padding,
+ attention_mask=first_attention_mask_padding)
+
+ second_input_ids_padding = torch.tensor(
+ [[50256, 11274, 16390, 11, 11274]])
+ second_input_ids_padding = composer_device.tensor_to_device(
+ second_input_ids_padding)
+ second_attention_mask_padding = torch.tensor([[0, 1, 1, 1, 1]]).bool()
+ second_attention_mask_padding = composer_device.tensor_to_device(
+ second_attention_mask_padding)
+
+ # pass through the fourth token by itself, using the key-value cache (with left padding)
+ second_output_padding = mpt(
+ second_input_ids_padding[:, -1].unsqueeze(-1),
+ attention_mask=second_attention_mask_padding,
+ past_key_values=first_output_padding.past_key_values)
+
+ # check that the outputs are the same with or without padding
+        # The dail implementation of rope uses bf16 precision, so the rotations
+        # have small numerical errors. This causes some differences between the
+        # outputs of padded and unpadded inputs.
+        if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail':
+ torch.testing.assert_close(
+ second_output_no_padding.logits,
+ second_output_padding.logits[:, -1, :].unsqueeze(1),
+ atol=1e-2,
+ rtol=1e-6)
+ else:
+ torch.testing.assert_close(
+ second_output_no_padding.logits,
+ second_output_padding.logits[:, -1, :].unsqueeze(1),
+ atol=1e-6,
+ rtol=1e-6)
+
+
+@pytest.mark.parametrize('attn_impl', [
+ 'torch',
+ pytest.param('flash', marks=pytest.mark.gpu),
+ pytest.param('triton', marks=pytest.mark.gpu),
+ pytest.param('torch', marks=pytest.mark.gpu),
])
-@pytest.mark.parametrize('alibi', [True, False])
-def test_forward_with_cache(attn_impl: str, device: str, alibi: bool):
+@pytest.mark.parametrize('pos_emb_config', [{
+ 'alibi': False,
+ 'rope': False
+}, {
+ 'alibi': True,
+ 'rope': False
+}, {
+ 'alibi': False,
+ 'rope': True,
+ 'rope_theta': 10000,
+ 'rope_impl': 'dail',
+ 'rope_dail_config': {
+ 'type': 'original',
+ 'pos_idx_in_fp32': True,
+ 'xpos_scale_base': 512,
+ },
+}, {
+ 'alibi': False,
+ 'rope': True,
+ 'rope_theta': 10000,
+ 'rope_impl': 'hf',
+ 'rope_hf_config': {
+ 'type': 'no_scaling',
+ 'factor': 1.0,
+ },
+}])
+@pytest.mark.parametrize('tie_word_embeddings', [True, False])
+def test_forward_with_cache(attn_impl: str, pos_emb_config: dict,
+ tie_word_embeddings: bool):
# Test that model forward with and without the key-value cache produces the
# same output.
- if not torch.cuda.is_available() and device == 'gpu':
- pytest.skip(
- f'This test requires CUDA to be available in order to run with {attn_impl} attention.'
- )
- if alibi and attn_impl == 'flash':
+ if pos_emb_config['alibi'] and attn_impl == 'flash':
pytest.skip(f'alibi only implemented with torch and triton attention.')
- composer_device = get_device(device)
+ if pos_emb_config['rope'] and pos_emb_config[
+ 'rope_impl'] == 'dail' and not is_flash_v2_installed():
+ pytest.skip(
+ f'dail implementation of rope requires gpu and flash attention 2.')
+
+ composer_device = get_device(None)
hf_config = MPTConfig(
init_device='cpu',
@@ -997,24 +1164,21 @@ def test_forward_with_cache(attn_impl: str, device: str, alibi: bool):
resid_pdrop=0.2,
attn_config={
'attn_impl': attn_impl,
- 'alibi': alibi,
+ **pos_emb_config,
},
- attn_impl=attn_impl,
- alibi=alibi,
use_cache=True,
init_config={
'name': 'baseline_',
'init_std': 0.02,
},
+ tie_word_embeddings=tie_word_embeddings,
)
- reproducibility.seed_all(1234)
mpt = MPTForCausalLM(hf_config)
mpt = composer_device.module_to_device(mpt)
mpt.eval()
with get_precision_context('amp_bf16' if composer_device.name ==
'gpu' else 'fp32'):
- reproducibility.seed_all(1234)
first_input_ids = torch.tensor([[11274, 16390, 11]])
first_input_ids = composer_device.tensor_to_device(first_input_ids)
first_attention_mask = torch.tensor([[1, 1, 1]]).bool()
@@ -1040,7 +1204,6 @@ def test_forward_with_cache(attn_impl: str, device: str, alibi: bool):
assert all(past_key_value[1].shape == (1, 3, 128)
for past_key_value in first_output.past_key_values)
- reproducibility.seed_all(1234)
second_input_ids = torch.tensor([[11274, 16390, 11, 11274]])
second_input_ids = composer_device.tensor_to_device(second_input_ids)
second_attention_mask = torch.tensor([[1, 1, 1, 1]]).bool()
@@ -1070,7 +1233,6 @@ def test_forward_with_cache(attn_impl: str, device: str, alibi: bool):
assert all(past_key_value[1].shape == (1, 4, 128)
for past_key_value in second_output.past_key_values)
- reproducibility.seed_all(1234)
# pass through the first four tokens without the key-value cache
full_output = mpt(second_input_ids,
attention_mask=second_attention_mask)
@@ -1079,13 +1241,55 @@ def test_forward_with_cache(attn_impl: str, device: str, alibi: bool):
torch.testing.assert_close(
second_output.logits,
full_output.logits[:, -1, :].unsqueeze(1),
- atol=1e-2,
+ atol=1.1e-2,
rtol=1e-2,
)
-@pytest.mark.parametrize('alibi', [True, False])
-def test_generate_with_past_kv(alibi: bool):
+@pytest.mark.parametrize('attn_impl', [
+ 'torch',
+ pytest.param('flash', marks=pytest.mark.gpu),
+ pytest.param('triton', marks=pytest.mark.gpu),
+ pytest.param('torch', marks=pytest.mark.gpu),
+])
+@pytest.mark.parametrize('pos_emb_config', [{
+ 'alibi': False,
+ 'rope': False
+}, {
+ 'alibi': True,
+ 'rope': False
+}, {
+ 'alibi': False,
+ 'rope': True,
+ 'rope_theta': 10000,
+ 'rope_impl': 'dail',
+ 'rope_dail_config': {
+ 'type': 'original',
+ 'pos_idx_in_fp32': True,
+ 'xpos_scale_base': 512,
+ },
+}, {
+ 'alibi': False,
+ 'rope': True,
+ 'rope_theta': 10000,
+ 'rope_impl': 'hf',
+ 'rope_hf_config': {
+ 'type': 'no_scaling',
+ 'factor': 1.0,
+ },
+}])
+@pytest.mark.parametrize('tie_word_embeddings', [True, False])
+def test_generate_with_past_kv(attn_impl: str, pos_emb_config: dict,
+ tie_word_embeddings: bool):
+ if pos_emb_config['alibi'] and attn_impl == 'flash':
+ pytest.skip(f'alibi only implemented with torch and triton attention.')
+ if pos_emb_config['rope'] and pos_emb_config[
+ 'rope_impl'] == 'dail' and not is_flash_v2_installed():
+ pytest.skip(
+ f'dail implementation of rope requires gpu and flash attention 2.')
+
+ composer_device = get_device(None)
+
hf_config = MPTConfig(
init_device='cpu',
d_model=128,
@@ -1096,43 +1300,58 @@ def test_generate_with_past_kv(alibi: bool):
emb_pdrop=0.1,
resid_pdrop=0.2,
attn_config={
- 'attn_impl': 'torch',
- 'alibi': alibi,
+ 'attn_impl': attn_impl,
+ **pos_emb_config,
},
use_cache=True,
init_config={
'name': 'baseline_',
'init_std': 0.02,
},
+ tie_word_embeddings=tie_word_embeddings,
)
mpt = MPTForCausalLM(hf_config)
+ mpt = composer_device.module_to_device(mpt)
mpt.eval()
# no padding in the input
no_padding_input_ids = torch.tensor([[11274, 16390, 11]])
+ no_padding_input_ids = composer_device.tensor_to_device(
+ no_padding_input_ids)
no_padding_attention_mask = torch.tensor([[1, 1, 1]])
+ no_padding_attention_mask = composer_device.tensor_to_device(
+ no_padding_attention_mask)
- with mock.patch.object(MPTForCausalLM, 'forward',
- autospec=True) as forward_mocked:
- forward_mocked.return_value = CausalLMOutputWithPast(
- logits=torch.randn((1, 3, hf_config.vocab_size)),
- past_key_values=[(torch.randn(1, 3, hf_config.d_model),
- torch.randn(1, 3, hf_config.d_model))
- for _ in range(hf_config.n_layers)])
- _ = mpt.generate(input_ids=no_padding_input_ids,
- attention_mask=no_padding_attention_mask,
- max_new_tokens=2)
-
- assert forward_mocked.call_count == 2
- _, _, kwargs = forward_mocked.mock_calls[0]
- assert kwargs['past_key_values'] is None
- _, _, kwargs = forward_mocked.mock_calls[1]
- assert kwargs['past_key_values'] is not None
- assert len(kwargs['past_key_values']) == hf_config.n_layers
- assert kwargs['past_key_values'][0][0].shape == (1, 3,
- hf_config.d_model)
-
-
+ with get_precision_context('amp_bf16' if composer_device.name ==
+ 'gpu' else 'fp32'):
+ with mock.patch.object(MPTForCausalLM, 'forward',
+ autospec=True) as forward_mocked:
+ forward_mocked.return_value = CausalLMOutputWithPast(
+ logits=composer_device.tensor_to_device(
+ torch.randn((1, 3, hf_config.vocab_size))),
+ past_key_values=[(torch.randn(1, 3, hf_config.d_model),
+ torch.randn(1, 3, hf_config.d_model))
+ for _ in range(hf_config.n_layers)])
+ _ = mpt.generate(input_ids=no_padding_input_ids,
+ attention_mask=no_padding_attention_mask,
+ max_new_tokens=2)
+
+ assert forward_mocked.call_count == 2
+ _, _, kwargs = forward_mocked.mock_calls[0]
+ assert kwargs['past_key_values'] is None
+ _, _, kwargs = forward_mocked.mock_calls[1]
+ assert kwargs['past_key_values'] is not None
+ assert len(kwargs['past_key_values']) == hf_config.n_layers
+ assert kwargs['past_key_values'][0][0].shape == (1, 3,
+ hf_config.d_model)
+
+
+@pytest.mark.parametrize('attn_impl', [
+ 'torch',
+ pytest.param('flash', marks=pytest.mark.gpu),
+ pytest.param('triton', marks=pytest.mark.gpu),
+ pytest.param('torch', marks=pytest.mark.gpu),
+])
@pytest.mark.parametrize('generation_kwargs', [{
'max_new_tokens': 2,
'num_beams': 4
@@ -1144,9 +1363,49 @@ def test_generate_with_past_kv(alibi: bool):
'do_sample': True,
'top_p': 0.95
}])
-@pytest.mark.parametrize('alibi', [True, False])
-def test_generation_kwargs_dont_crash(generation_kwargs: Dict[str, Any],
- alibi: bool):
+@pytest.mark.parametrize('pos_emb_config', [{
+ 'alibi': False,
+ 'rope': False
+}, {
+ 'alibi': True,
+ 'rope': False
+}, {
+ 'alibi': False,
+ 'rope': True,
+ 'rope_theta': 10000,
+ 'rope_impl': 'dail',
+ 'rope_dail_config': {
+ 'type': 'original',
+ 'pos_idx_in_fp32': True,
+ 'xpos_scale_base': 512,
+ },
+}, {
+ 'alibi': False,
+ 'rope': True,
+ 'rope_theta': 10000,
+ 'rope_impl': 'hf',
+ 'rope_hf_config': {
+ 'type': 'no_scaling',
+ 'factor': 1.0,
+ },
+}])
+@pytest.mark.parametrize('tie_word_embeddings', [True, False])
+def test_generation_kwargs_dont_crash(attn_impl: str,
+ generation_kwargs: Dict[str, Any],
+ pos_emb_config: dict,
+ tie_word_embeddings: bool):
+ if pos_emb_config['alibi'] and attn_impl == 'flash':
+ pytest.skip(f'alibi only implemented with torch and triton attention.')
+
+ if pos_emb_config['rope'] and pos_emb_config[
+ 'rope_impl'] == 'dail' and not is_flash_v2_installed():
+ pytest.skip(
+ f'dail implementation of rope requires gpu and flash attention 2.')
+ composer_device = get_device(None)
+
+ if composer_device.name == 'gpu':
+ torch.use_deterministic_algorithms(False)
+
hf_config = MPTConfig(
init_device='cpu',
d_model=128,
@@ -1157,35 +1416,73 @@ def test_generation_kwargs_dont_crash(generation_kwargs: Dict[str, Any],
emb_pdrop=0.1,
resid_pdrop=0.2,
attn_config={
- 'attn_impl': 'torch',
- 'alibi': alibi,
+ 'attn_impl': attn_impl,
+ **pos_emb_config,
},
use_cache=True,
+ tie_word_embeddings=tie_word_embeddings,
)
mpt = MPTForCausalLM(hf_config)
+ mpt = composer_device.module_to_device(mpt)
mpt.eval()
- # no padding in the input
- no_padding_input_ids = torch.tensor([[11274, 16390, 11]])
- no_padding_attention_mask = torch.tensor([[1, 1, 1]])
+ with get_precision_context('amp_bf16' if composer_device.name ==
+ 'gpu' else 'fp32'):
+ # no padding in the input
+ no_padding_input_ids = torch.tensor([[11274, 16390, 11]])
+ no_padding_input_ids = composer_device.tensor_to_device(
+ no_padding_input_ids)
+ no_padding_attention_mask = torch.tensor([[1, 1, 1]])
+ no_padding_attention_mask = composer_device.tensor_to_device(
+ no_padding_attention_mask)
+
+ _ = mpt.generate(input_ids=no_padding_input_ids,
+ attention_mask=no_padding_attention_mask,
+ **generation_kwargs)
- _ = mpt.generate(input_ids=no_padding_input_ids,
- attention_mask=no_padding_attention_mask,
- **generation_kwargs)
+ if composer_device.name == 'gpu':
+ reproducibility.configure_deterministic_mode()
@pytest.mark.gpu
@pytest.mark.parametrize('attention_impl', ['torch', 'flash', 'triton'])
-@pytest.mark.parametrize('alibi', [True, False])
-def test_model_to(attention_impl: str, alibi: bool):
+@pytest.mark.parametrize('pos_emb_config', [{
+ 'alibi': False,
+ 'rope': False
+}, {
+ 'alibi': True,
+ 'rope': False
+}, {
+ 'alibi': False,
+ 'rope': True,
+ 'rope_theta': 10000,
+ 'rope_impl': 'dail',
+ 'rope_dail_config': {
+ 'type': 'original',
+ 'pos_idx_in_fp32': True,
+ 'xpos_scale_base': 512,
+ },
+}, {
+ 'alibi': False,
+ 'rope': True,
+ 'rope_theta': 10000,
+ 'rope_impl': 'hf',
+ 'rope_hf_config': {
+ 'type': 'no_scaling',
+ 'factor': 1.0,
+ },
+}])
+@pytest.mark.parametrize('tie_word_embeddings', [True, False])
+def test_model_to(attention_impl: str, pos_emb_config: dict,
+ tie_word_embeddings: bool):
# test that moving the model to diff devices and dtypes in diff ways does not break the model
- if not torch.cuda.is_available():
- pytest.skip(
- f'This test requires CUDA to be available in order to run with {attention_impl} attention.'
- )
- if alibi and attention_impl == 'flash':
+ if pos_emb_config['alibi'] and attention_impl == 'flash':
pytest.skip(f'alibi only implemented with torch and triton attention.')
+ if pos_emb_config['rope'] and pos_emb_config[
+ 'rope_impl'] == 'dail' and not is_flash_v2_installed():
+ pytest.skip(f'dail implementation of rope requires flash attention 2.')
+
hf_config = MPTConfig(
init_device='cpu',
d_model=128,
@@ -1197,15 +1494,15 @@ def test_model_to(attention_impl: str, alibi: bool):
resid_pdrop=0.2,
attn_config={
'attn_impl': attention_impl,
- 'alibi': alibi,
+ **pos_emb_config,
},
use_cache=True,
init_config={
'name': 'baseline_',
'init_std': 0.02,
},
+ tie_word_embeddings=tie_word_embeddings,
)
- reproducibility.seed_all(1234)
mpt = MPTForCausalLM(hf_config)
mpt = mpt.bfloat16()
mpt = mpt.to('cuda')
@@ -1223,7 +1520,8 @@ def test_model_to(attention_impl: str, alibi: bool):
mpt = mpt.to('cpu')
# verify the model still works
- if attention_impl == 'torch':
+ if attention_impl == 'torch' and not (
+ pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail'):
with torch.autocast('cpu', dtype=torch.bfloat16, enabled=True):
_ = mpt(input_ids.to('cpu'),
attention_mask=attention_mask.to('cpu'))
@@ -1240,7 +1538,8 @@ def test_model_to(attention_impl: str, alibi: bool):
mpt = mpt.float()
# verify the model still works
- if attention_impl == 'torch':
+ if attention_impl == 'torch' and not (
+ pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail'):
_ = mpt(input_ids.to('cpu'), attention_mask=attention_mask.to('cpu'))
mpt = mpt.half()
@@ -1271,29 +1570,55 @@ def test_alibi_vs_hf():
torch.testing.assert_close(alibi_bias_hf, alibi_bias_m)
-@pytest.mark.parametrize('attn_impl,device', [
- ('torch', 'cpu'),
- ('flash', 'gpu'),
- ('triton', 'gpu'),
- ('torch', 'gpu'),
+@pytest.mark.parametrize('attn_impl', [
+ 'torch',
+ pytest.param('flash', marks=pytest.mark.gpu),
+ pytest.param('triton', marks=pytest.mark.gpu),
+ pytest.param('torch', marks=pytest.mark.gpu),
])
-@pytest.mark.parametrize('alibi', [True, False])
+@pytest.mark.parametrize('pos_emb_config', [{
+ 'alibi': False,
+ 'rope': False
+}, {
+ 'alibi': True,
+ 'rope': False
+}, {
+ 'alibi': False,
+ 'rope': True,
+ 'rope_theta': 10000,
+ 'rope_impl': 'dail',
+ 'rope_dail_config': {
+ 'type': 'original',
+ 'pos_idx_in_fp32': True,
+ 'xpos_scale_base': 512,
+ },
+}, {
+ 'alibi': False,
+ 'rope': True,
+ 'rope_theta': 10000,
+ 'rope_impl': 'hf',
+ 'rope_hf_config': {
+ 'type': 'no_scaling',
+ 'factor': 1.0,
+ },
+}])
@pytest.mark.parametrize('output_attentions', [True, False])
@pytest.mark.parametrize('output_hidden_states', [True, False])
+@pytest.mark.parametrize('tie_word_embeddings', [True, False])
def test_forward_with_output_attentions_and_output_hidden_states(
- attn_impl: str, device: str, alibi: bool, output_attentions: bool,
- output_hidden_states: bool):
+ attn_impl: str, pos_emb_config: dict, output_attentions: bool,
+ output_hidden_states: bool, tie_word_embeddings: bool):
# Test that model forward with output_attentions_and_output_hidden_states
- if not torch.cuda.is_available() and device == 'gpu':
- pytest.skip(
- f'This test requires CUDA to be available in order to run with {attn_impl} attention.'
- )
- if alibi and attn_impl == 'flash':
+ if pos_emb_config['alibi'] and attn_impl == 'flash':
pytest.skip(f'alibi only implemented with torch and triton attention.')
if output_attentions and attn_impl in ['flash', 'triton']:
pytest.skip(f'output_attentions only implemented with torch attention.')
+ if pos_emb_config['rope'] and pos_emb_config[
+ 'rope_impl'] == 'dail' and not is_flash_v2_installed():
+ pytest.skip(
+ f'dail implementation of rope requires gpu and flash attention 2.')
- composer_device = get_device(device)
+ composer_device = get_device(None)
n_layers = 2
@@ -1308,24 +1633,21 @@ def test_forward_with_output_attentions_and_output_hidden_states(
resid_pdrop=0.2,
attn_config={
'attn_impl': attn_impl,
- 'alibi': alibi,
+ **pos_emb_config,
},
- attn_impl=attn_impl,
- alibi=alibi,
use_cache=True,
init_config={
'name': 'baseline_',
'init_std': 0.02,
},
+ tie_word_embeddings=tie_word_embeddings,
)
- reproducibility.seed_all(1234)
mpt = MPTForCausalLM(hf_config)
mpt = composer_device.module_to_device(mpt)
mpt.eval()
with get_precision_context('amp_bf16' if composer_device.name ==
'gpu' else 'fp32'):
- reproducibility.seed_all(1234)
input_ids = torch.tensor([[11274, 16390, 11]])
input_ids = composer_device.tensor_to_device(input_ids)
attention_mask = torch.tensor([[1, 1, 1]]).bool()
@@ -1354,8 +1676,6 @@ def test_hf_init(tmp_path: pathlib.Path,
init_device: str,
world_size: int,
batch_size: int = 1):
- if not torch.cuda.is_available():
- pytest.skip(f'This test requires CUDA to be available.')
if not torch.cuda.device_count() >= world_size:
pytest.skip(f'This test requires {world_size} GPUs.')
@@ -1483,3 +1803,99 @@ def test_head_dim_8_triton_mqa_attn(batch_size: int = 2):
output = model(batch)
assert not torch.isnan(output.logits).any()
+
+@pytest.mark.world_size(2)
+@pytest.mark.gpu
+def test_tp_qkvo():
+ # Note: we need the RNG state in this test to ensure that weights
+ # are initialized with the same values in both models. Without it,
+ # even with a random seed, the weights will be different since the
+ # RNG state changes with each init.
+ rng_state = reproducibility.get_rng_state()
+
+ local_world_size = dist.get_local_world_size()
+ sharded_model_cfg = {
+ 'name': 'mpt_causal_lm',
+ 'init_device': 'cpu',
+ 'd_model': 128,
+ 'n_heads': 4, # head size 32
+ 'n_layers': 2,
+ 'expansion_ratio': 1,
+ 'max_seq_len': 16,
+ 'vocab_size': 50368,
+ 'attn_config': {
+ 'attn_type': 'multihead_attention',
+ 'alibi': False,
+ 'attn_impl': 'torch',
+ 'tensor_parallel_qkvo': True,
+ 'tp_world_size': local_world_size
+ }
+ }
+
+ # Create the same model config, but with TP turned off
+ full_model_cfg = copy.deepcopy(sharded_model_cfg)
+ full_model_cfg['attn_config']['tensor_parallel_qkvo'] = False
+ del full_model_cfg['attn_config']['tp_world_size']
+
+ sharded_model_cfg = om.create(sharded_model_cfg)
+ full_model_cfg = om.create(full_model_cfg)
+
+ sharded_model = COMPOSER_MODEL_REGISTRY[sharded_model_cfg.name](sharded_model_cfg)
+ reproducibility.load_rng_state(rng_state)
+
+ full_model = COMPOSER_MODEL_REGISTRY[full_model_cfg.name](full_model_cfg)
+
+ fsdp_config = {
+ 'sharding_strategy': 'NO_SHARD',
+ 'mixed_precision': 'DEFAULT'
+ }
+    # The trainer is used to wrap the model in FSDP, which can be used
+    # alongside TP for 2D parallelism
+ trainer = Trainer(
+ model=sharded_model,
+ fsdp_config=fsdp_config,
+ )
+
+ trainer = Trainer(
+ model=full_model,
+ fsdp_config=fsdp_config,
+ )
+
+ sharded_transformer_blocks = sharded_model.model.transformer.blocks
+ full_transformer_blocks = full_model.model.transformer.blocks
+ for sharded_block, full_block in zip(sharded_transformer_blocks, full_transformer_blocks):
+ sharded_attn_module = sharded_block._fsdp_wrapped_module.attn
+ full_attn_module = full_block._fsdp_wrapped_module.attn
+
+ # Check that all attention module weights are DTensors
+ assert isinstance(sharded_attn_module.Wqkv.weight, DTensor)
+ assert isinstance(sharded_attn_module.out_proj.weight, DTensor)
+
+ Wqkv_local = sharded_attn_module.Wqkv.weight._local_tensor
+ out_proj_local = sharded_attn_module.out_proj.weight._local_tensor
+
+        # Wqkv is colwise-sharded, so its output dimension (dim 0, since torch
+        # stores Linear weights as (out_features, in_features)) is sharded
+        # along the device mesh
+ assert Wqkv_local.shape[0] * local_world_size == sharded_model_cfg.d_model * 3
+
+ # The out projection is rowwise-sharded, so its input dimension (dim 1)
+ # is sharded along the device mesh
+ assert out_proj_local.shape[1] * local_world_size == sharded_model_cfg.d_model
+
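+        # Reorder the full model's fused Wqkv rows into the layout produced by
+        # column-wise sharding, so each rank's local shard corresponds to a
+        # contiguous slice of the reordered weight.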
+ Wqkv_interleaved = rearrange_tensor(
+ full_attn_module.Wqkv.weight,
+ local_world_size,
+ sharded_model_cfg.d_model,
+ sharded_model_cfg.d_model // sharded_model_cfg.n_heads,
+ sharded_model_cfg.n_heads
+ )
+ # Check that the sharded output weights are the same as the full model
+ # weights:
+ # rank 0 should have the top half of out proj and the left half of Wqkv
+ # rank 1 should have the bottom half of out proj and the right half of Wqkv
+ if dist.get_local_rank() == 0:
+ assert torch.equal(out_proj_local, full_attn_module.out_proj.weight[:, :out_proj_local.shape[1]])
+ assert torch.equal(Wqkv_local, Wqkv_interleaved[:Wqkv_local.shape[0], :])
+ else:
+ assert torch.equal(out_proj_local, full_attn_module.out_proj.weight[:, out_proj_local.shape[1]:])
+ assert torch.equal(Wqkv_local, Wqkv_interleaved[Wqkv_local.shape[0]:, :])
\ No newline at end of file
diff --git a/tests/test_model_download_utils.py b/tests/test_model_download_utils.py
new file mode 100644
index 0000000000..27b9805cda
--- /dev/null
+++ b/tests/test_model_download_utils.py
@@ -0,0 +1,248 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+import unittest.mock as mock
+from http import HTTPStatus
+from typing import Any, Dict, List
+from unittest.mock import MagicMock
+from urllib.parse import urljoin
+
+import pytest
+import requests
+import tenacity
+from huggingface_hub.utils import RepositoryNotFoundError
+from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME
+from transformers.utils import WEIGHTS_INDEX_NAME as PYTORCH_WEIGHTS_INDEX_NAME
+from transformers.utils import WEIGHTS_NAME as PYTORCH_WEIGHTS_NAME
+
+from llmfoundry.utils.model_download_utils import (DEFAULT_IGNORE_PATTERNS,
+ PYTORCH_WEIGHTS_PATTERN,
+ SAFE_WEIGHTS_PATTERN,
+ download_from_cache_server,
+ download_from_hf_hub)
+
+# ======================== download_from_hf_hub tests ========================
+
+
+@pytest.mark.parametrize(
+ ['prefer_safetensors', 'repo_files', 'expected_ignore_patterns'],
+ [
+ [ # Should use default ignore if only safetensors available
+ True,
+ [SAFE_WEIGHTS_NAME],
+ DEFAULT_IGNORE_PATTERNS,
+ ],
+ [
+ # Should use default ignore if only safetensors available
+ False,
+ [SAFE_WEIGHTS_NAME],
+ DEFAULT_IGNORE_PATTERNS,
+ ],
+ [ # Should use default ignore if only sharded safetensors available
+ True,
+ [SAFE_WEIGHTS_INDEX_NAME],
+ DEFAULT_IGNORE_PATTERNS,
+ ],
+ [
+ # Should use default ignore if only sharded safetensors available
+ False,
+ [SAFE_WEIGHTS_INDEX_NAME],
+ DEFAULT_IGNORE_PATTERNS,
+ ],
+ [
+ # Should use default ignore if only pytorch available
+ True,
+ [PYTORCH_WEIGHTS_NAME],
+ DEFAULT_IGNORE_PATTERNS,
+ ],
+ [
+ # Should use default ignore if only pytorch available
+ False,
+ [PYTORCH_WEIGHTS_NAME],
+ DEFAULT_IGNORE_PATTERNS,
+ ],
+ [
+ # Should use default ignore if only sharded pytorch available
+ True,
+ [PYTORCH_WEIGHTS_INDEX_NAME],
+ DEFAULT_IGNORE_PATTERNS,
+ ],
+ [
+ # Should use default ignore if only sharded pytorch available
+ False,
+ [PYTORCH_WEIGHTS_INDEX_NAME],
+ DEFAULT_IGNORE_PATTERNS,
+ ],
+ [ # Ignore pytorch if safetensors are preferred
+ True,
+ [PYTORCH_WEIGHTS_NAME, SAFE_WEIGHTS_NAME],
+ DEFAULT_IGNORE_PATTERNS + [PYTORCH_WEIGHTS_PATTERN],
+ ],
+ [ # Ignore safetensors if pytorch is preferred
+ False,
+ [PYTORCH_WEIGHTS_NAME, SAFE_WEIGHTS_NAME],
+ DEFAULT_IGNORE_PATTERNS + [SAFE_WEIGHTS_PATTERN],
+ ],
+ [ # Ignore pytorch if safetensors are preferred
+ True,
+ [PYTORCH_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_INDEX_NAME],
+ DEFAULT_IGNORE_PATTERNS + [PYTORCH_WEIGHTS_PATTERN],
+ ],
+ [ # Ignore safetensors if pytorch is preferred
+ False,
+ [PYTORCH_WEIGHTS_NAME, SAFE_WEIGHTS_NAME],
+ DEFAULT_IGNORE_PATTERNS + [SAFE_WEIGHTS_PATTERN],
+ ],
+ ])
+@mock.patch('huggingface_hub.snapshot_download')
+@mock.patch('huggingface_hub.list_repo_files')
+def test_download_from_hf_hub_weights_pref(mock_list_repo_files: MagicMock,
+ mock_snapshot_download: MagicMock,
+ prefer_safetensors: bool,
+ repo_files: List[str],
+ expected_ignore_patterns: List[str]):
+ test_repo_id = 'test_repo_id'
+ mock_list_repo_files.return_value = repo_files
+
+ download_from_hf_hub(test_repo_id, prefer_safetensors=prefer_safetensors)
+ mock_snapshot_download.assert_called_once_with(
+ test_repo_id,
+ cache_dir=None,
+ ignore_patterns=expected_ignore_patterns,
+ token=None,
+ )
+
+
+@mock.patch('huggingface_hub.snapshot_download')
+@mock.patch('huggingface_hub.list_repo_files')
+def test_download_from_hf_hub_no_weights(
+ mock_list_repo_files: MagicMock,
+ mock_snapshot_download: MagicMock,
+):
+ test_repo_id = 'test_repo_id'
+ mock_list_repo_files.return_value = []
+
+ with pytest.raises(ValueError):
+ download_from_hf_hub(test_repo_id)
+
+ mock_snapshot_download.assert_not_called()
+
+
+@pytest.mark.parametrize(['exception', 'expected_attempts'], [
+ [requests.exceptions.RequestException(), 3],
+ [RepositoryNotFoundError(''), 1],
+ [ValueError(), 1],
+])
+@mock.patch('tenacity.nap.time.sleep')
+@mock.patch('huggingface_hub.snapshot_download')
+@mock.patch('huggingface_hub.list_repo_files')
+def test_download_from_hf_hub_retry(
+ mock_list_repo_files: MagicMock,
+ mock_snapshot_download: MagicMock,
+ mock_sleep: MagicMock, # so the retry wait doesn't actually wait
+ exception: BaseException,
+ expected_attempts: int,
+):
+ mock_list_repo_files.return_value = [SAFE_WEIGHTS_INDEX_NAME]
+ mock_snapshot_download.side_effect = exception
+
+ with pytest.raises((tenacity.RetryError, exception.__class__)):
+ download_from_hf_hub('test_repo_id')
+
+ assert mock_snapshot_download.call_count == expected_attempts
+
+
+# ======================== download_from_cache_server tests ========================
+
+ROOT_HTML = b"""
+
+
+
+
+
+
+"""
+
+SUBFOLDER_HTML = b"""
+
+
+
+
+
+
+"""
+
+
+@mock.patch.object(requests.Session, 'get')
+@mock.patch('os.makedirs')
+@mock.patch('builtins.open')
+def test_download_from_cache_server(mock_open: MagicMock,
+ mock_makedirs: MagicMock,
+ mock_get: MagicMock):
+ cache_url = 'https://cache.com/'
+ model_name = 'model'
+ formatted_model_name = 'models--model'
+ save_dir = 'save_dir/'
+
+ mock_open.return_value = MagicMock()
+
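+    # Simulate a simple HTTP file listing: the root blobs/ page links to file1
+    # and a subfolder, and the subfolder page links to file2.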
+ def _server_response(url: str, **kwargs: Dict[str, Any]):
+ if url == urljoin(cache_url, f'{formatted_model_name}/blobs/'):
+ return MagicMock(status_code=HTTPStatus.OK, content=ROOT_HTML)
+ if url == urljoin(cache_url, f'{formatted_model_name}/blobs/file1'):
+ return MagicMock(status_code=HTTPStatus.OK)
+ elif url == urljoin(cache_url, f'{formatted_model_name}/blobs/folder/'):
+ return MagicMock(status_code=HTTPStatus.OK, content=SUBFOLDER_HTML)
+ elif url == urljoin(cache_url,
+ f'{formatted_model_name}/blobs/folder/file2'):
+ return MagicMock(status_code=HTTPStatus.OK)
+ else:
+ return MagicMock(status_code=HTTPStatus.NOT_FOUND)
+
+ mock_get.side_effect = _server_response
+ download_from_cache_server(model_name, cache_url, 'save_dir/')
+
+ mock_open.assert_has_calls([
+ mock.call(os.path.join(save_dir, formatted_model_name, 'blobs/file1'),
+ 'wb'),
+ mock.call(
+ os.path.join(save_dir, formatted_model_name, 'blobs/folder/file2'),
+ 'wb'),
+ ],
+ any_order=True)
+
+
+@mock.patch.object(requests.Session, 'get')
+def test_download_from_cache_server_unauthorized(mock_get: MagicMock):
+ cache_url = 'https://cache.com/'
+ model_name = 'model'
+ save_dir = 'save_dir/'
+
+ mock_get.return_value = MagicMock(status_code=HTTPStatus.UNAUTHORIZED)
+ with pytest.raises(PermissionError):
+ download_from_cache_server(model_name, cache_url, save_dir)
+
+
+@pytest.mark.parametrize(['exception', 'expected_attempts'], [
+ [requests.exceptions.RequestException(), 3],
+ [PermissionError(), 1],
+ [ValueError(), 1],
+])
+@mock.patch('tenacity.nap.time.sleep')
+@mock.patch('llmfoundry.utils.model_download_utils._recursive_download')
+def test_download_from_cache_server_retry(
+ mock_recursive_download: MagicMock,
+ mock_sleep: MagicMock, # so the retry wait doesn't actually wait
+ exception: BaseException,
+ expected_attempts: int,
+):
+ mock_recursive_download.side_effect = exception
+
+ with pytest.raises((tenacity.RetryError, exception.__class__)):
+ download_from_cache_server('model', 'cache_url', 'save_dir')
diff --git a/tests/test_mpt_gen.py b/tests/test_mpt_gen.py
index 06ddccd479..9f022ef487 100644
--- a/tests/test_mpt_gen.py
+++ b/tests/test_mpt_gen.py
@@ -1,19 +1,21 @@
# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0
-from typing import List, Optional, Tuple
-from unittest.mock import patch
+from typing import Callable, List, Optional, Tuple
+from unittest.mock import Mock, patch
import pytest
import torch
+from composer import Trainer
+from composer.callbacks import Generate as ComposerGenerate
from composer.core.precision import get_precision_context
-from composer.utils import dist, get_device, reproducibility
-from omegaconf import DictConfig
+from composer.utils import dist, get_device
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+from torch.utils.data import DataLoader
+from transformers import PreTrainedTokenizerBase
-from llmfoundry import COMPOSER_MODEL_REGISTRY
-from llmfoundry.models.mpt.modeling_mpt import MPTForCausalLM
-from llmfoundry.utils import build_tokenizer
+from llmfoundry.models.mpt.modeling_mpt import (ComposerMPTCausalLM,
+ MPTForCausalLM)
EOS_TOKEN_ID = 0
@@ -53,46 +55,140 @@ def forward(
@pytest.mark.gpu
@pytest.mark.parametrize('attn_impl', ['triton', 'torch'])
@pytest.mark.parametrize('use_alibi', [True, False])
+@pytest.mark.parametrize('tie_word_embeddings', [True, False])
@patch('llmfoundry.models.mpt.modeling_mpt.MPTForCausalLM',
new=MockMPTForCausalLM)
-def test_mpt_generate_multi_gpu(attn_impl: str, use_alibi: bool):
+def test_mpt_generate_multi_gpu(attn_impl: str, use_alibi: bool,
+ tie_word_embeddings: bool,
+ build_tiny_mpt: Callable[...,
+ ComposerMPTCausalLM],
+ mpt_tokenizer: PreTrainedTokenizerBase):
"""Tests mpt generation with mutiple gpus.
and generations of different lengths.
"""
- composer_device = get_device('gpu')
- dist.initialize_dist(composer_device)
- reproducibility.seed_all(42)
-
- model_config = DictConfig({
- 'name': 'mpt_causal_lm',
- 'd_model': 128,
- 'n_heads': 4,
- 'n_layers': 2,
- 'expansion_ratio': 2,
- 'no_bias': False,
- 'use_cache': True,
- 'attn_config': {
+ device = get_device('gpu')
+
+ model = build_tiny_mpt(
+ tie_word_embeddings=tie_word_embeddings,
+ attn_config={
'attn_impl': attn_impl,
'attn_uses_sequence_id': False,
'alibi': use_alibi
},
- })
-
- # build tokenizer
- tokenizer = build_tokenizer('EleutherAI/gpt-neox-20b', {})
+ )
+ model = device.module_to_device(model)
- # build model
- model = COMPOSER_MODEL_REGISTRY[model_config.name](model_config, tokenizer)
- model = composer_device.module_to_device(model)
model.eval()
model.model = FSDP(model.model)
with get_precision_context('amp_bf16'):
- _ = model.generate(composer_device.tensor_to_device(
- tokenizer('hello', return_tensors='pt')['input_ids']),
+ _ = model.generate(device.tensor_to_device(
+ mpt_tokenizer('hello', return_tensors='pt')['input_ids']),
max_new_tokens=3,
eos_token_id=EOS_TOKEN_ID,
use_cache=True,
synced_gpus=True)
+
+
+@pytest.mark.gpu
+@pytest.mark.parametrize('attn_impl', ['triton', 'torch'])
+@pytest.mark.parametrize('use_alibi', [True, False])
+def test_mpt_generate_callback(attn_impl: str, use_alibi: bool,
+ build_tiny_mpt: Callable[...,
+ ComposerMPTCausalLM],
+ tiny_ft_dataloader: DataLoader):
+ device = get_device('gpu')
+
+ # build mpt model
+ model = build_tiny_mpt(
+ tie_word_embeddings=True,
+ attn_config={
+ 'attn_impl': attn_impl,
+ 'attn_uses_sequence_id': False,
+ 'alibi': use_alibi
+ },
+ )
+ model = device.module_to_device(model)
+
+ # generate callback
+ prompts = [
+ 'The best banana bread recipe is',
+ '2+2=',
+ 'how much wood could a woodchuck chuck',
+ ]
+ gen_interval = 1
+ generate = ComposerGenerate(
+ prompts,
+ interval=f'{gen_interval}ba',
+ max_new_tokens=5,
+ batch_size=len(prompts),
+ use_cache=True,
+ )
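+    # Wrap generate so we can assert the callback fires exactly once.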
+ generate.generate = Mock(wraps=generate.generate, autospec=True)
+
+ # build trainer
+ trainer = Trainer(
+ model=model,
+ train_dataloader=tiny_ft_dataloader,
+ device=device,
+ max_duration=f'{gen_interval}ba',
+ callbacks=[generate],
+ )
+ trainer.logger.log_table = Mock()
+ trainer.fit()
+
+ generate.generate.assert_called_once()
+ trainer.logger.log_table.assert_called_once()
+
+
+@pytest.mark.gpu
+@pytest.mark.parametrize('attn_impl', ['triton', 'torch'])
+@pytest.mark.parametrize('use_alibi', [True, False])
+def test_mpt_generate_callback_not_tied(
+ use_alibi: bool, attn_impl: str,
+ build_tiny_mpt: Callable[..., ComposerMPTCausalLM],
+ tiny_ft_dataloader: DataLoader):
+ device = get_device('gpu')
+
+ # build mpt model
+ model = build_tiny_mpt(
+ tie_word_embeddings=False,
+ attn_config={
+ 'attn_impl': attn_impl,
+ 'attn_uses_sequence_id': False,
+ 'alibi': use_alibi,
+ },
+ )
+ model = device.module_to_device(model)
+
+ # generate callback
+ prompts = [
+ 'The best banana bread recipe is',
+ '2+2=',
+ 'how much wood could a woodchuck chuck',
+ ]
+ gen_interval = 1
+ generate = ComposerGenerate(
+ prompts,
+ interval=f'{gen_interval}ba',
+ max_new_tokens=5,
+ batch_size=len(prompts),
+ use_cache=True,
+ )
+ generate.generate = Mock(wraps=generate.generate, autospec=True)
+
+ # build trainer
+ trainer = Trainer(
+ model=model,
+ train_dataloader=tiny_ft_dataloader,
+ device=device,
+ max_duration=f'{gen_interval}ba',
+ callbacks=[generate],
+ )
+ trainer.logger.log_table = Mock()
+ trainer.fit()
+
+ generate.generate.assert_called_once()
+ trainer.logger.log_table.assert_called_once()
diff --git a/tests/test_onnx.py b/tests/test_onnx.py
index 4ccb8e4112..becd3c773f 100644
--- a/tests/test_onnx.py
+++ b/tests/test_onnx.py
@@ -3,8 +3,8 @@
import pathlib
+import pytest
import torch
-from composer.utils import reproducibility
from transformers import AutoModelForCausalLM
from llmfoundry import MPTConfig, MPTForCausalLM
@@ -26,8 +26,8 @@ def gen_random_batch(batch_size: int, vocab_size: int, max_seq_len: int):
return batch
-def test_onnx_export(tmp_path: pathlib.Path):
- reproducibility.seed_all(42)
+@pytest.mark.parametrize('tie_word_embeddings', [True, False])
+def test_onnx_export(tie_word_embeddings: bool, tmp_path: pathlib.Path):
from transformers.models.auto.configuration_auto import CONFIG_MAPPING
CONFIG_MAPPING._extra_content['mpt'] = MPTConfig
AutoModelForCausalLM.register(MPTConfig, MPTForCausalLM)
@@ -50,6 +50,7 @@ def test_onnx_export(tmp_path: pathlib.Path):
use_cache=True,
vocab_size=vocab_size,
norm_type='layernorm',
+ tie_word_embeddings=tie_word_embeddings,
)
mpt = MPTForCausalLM(hf_config)
mpt.eval()
diff --git a/tests/test_packing.py b/tests/test_packing.py
new file mode 100644
index 0000000000..cbeca8b7b1
--- /dev/null
+++ b/tests/test_packing.py
@@ -0,0 +1,191 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Dict, List
+from unittest.mock import Mock, patch
+
+import pytest
+import torch
+from composer.utils import dist, reproducibility
+from omegaconf import DictConfig
+from pytest import approx
+from torch.utils.data import DataLoader
+
+from llmfoundry.data.finetuning.dataloader import build_finetuning_dataloader
+from llmfoundry.data.packing import BinPackCollator, auto_packing_ratio
+from llmfoundry.utils.builders import build_tokenizer
+
+
+def _data_to_batch(data: List[List[int]], max_seq_len: int,
+ pad_token_id: int) -> Dict[str, torch.Tensor]:
+ """Helper function to create a proper batch of data."""
+ input_ids = torch.stack([
+ torch.tensor(d + [pad_token_id] * (max_seq_len - len(d))) for d in data
+ ])
+
+ attention_mask = torch.stack([
+ torch.tensor([1] * len(d) + [pad_token_id] * (max_seq_len - len(d)))
+ for d in data
+ ])
+ return {'input_ids': input_ids, 'attention_mask': attention_mask}
+
+
+def test_packing():
+ """Tests that packing works for a single batch."""
+ pad_token_id = 0
+ max_seq_len = 5
+ packer = BinPackCollator(collator=lambda x: x,
+ target_batch_size=2,
+ max_seq_len=max_seq_len,
+ pad_token_id=pad_token_id,
+ padding_side='right')
+
+ batch = _data_to_batch([
+ [1],
+ [2] * 2,
+ [4] * 4,
+ [3] * 3,
+ ], max_seq_len, pad_token_id)
+
+ packed_samples = packer.pack(batch)
+
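+    # Two full bins: [3, 3, 3] + [2, 2] and [4, 4, 4, 4] + [1], so every
+    # position holds a real token and the attention mask is all ones.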
+ assert torch.equal(packed_samples['input_ids'],
+ torch.Tensor([[3, 3, 3, 2, 2], [4, 4, 4, 4, 1]]))
+ assert torch.all(packed_samples['attention_mask'] == 1)
+
+
+def test_packing_with_leftovers():
+ """Tests that packing handles leftovers and computes waste correctly."""
+ pad_token_id = 0
+ max_seq_len = 5
+ packer = BinPackCollator(collator=lambda x: x,
+ target_batch_size=2,
+ max_seq_len=max_seq_len,
+ pad_token_id=pad_token_id,
+ padding_side='right')
+
+ batch = _data_to_batch([
+ [1],
+ [2] * 2,
+ [4] * 4,
+ [4] * 4,
+ ], max_seq_len, pad_token_id)
+
+ packed_batch = packer.pack(batch)
+
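+    # The two length-4 sequences anchor the bins and [1] tops off the first;
+    # [2, 2] does not fit in either bin, so it is held over as a leftover.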
+ assert torch.equal(packed_batch['input_ids'],
+ torch.Tensor([[4, 4, 4, 4, 1], [4, 4, 4, 4, 0]]))
+ assert torch.equal(packed_batch['attention_mask'],
+ torch.Tensor([[1, 1, 1, 1, 1], [1, 1, 1, 1, 0]]))
+
+ # Check leftovers and waste.
+ assert len(packer._leftover_bins) == 1
+ leftover_size, leftover = packer._leftover_bins[0]
+ assert leftover_size == 2
+ assert torch.equal(leftover['input_ids'], torch.Tensor([2, 2]))
+ assert torch.equal(leftover['attention_mask'], torch.Tensor([1, 1]))
+ assert packer.waste == approx(2 / 11) # 2 tokens wasted of 11 tokens total
+
+ # Ensure that leftovers are used in the next batch if possible.
+ batch = _data_to_batch([[1]], max_seq_len, pad_token_id)
+ packed_batch = packer.pack(batch)
+ assert torch.equal(packed_batch['input_ids'],
+ torch.Tensor([[2, 2, 0, 0, 0], [1, 0, 0, 0, 0]]))
+ assert torch.equal(packed_batch['attention_mask'],
+ torch.Tensor([[1, 1, 0, 0, 0], [1, 0, 0, 0, 0]]))
+
+
+@patch('llmfoundry.data.packing.profile_packing')
+def test_auto_packing(profile_packing: Mock):
+ """Tests that auto packing selects the highest packing ratio with zero.
+
+ waste.
+ """
+ # List of tuples of packing_ratio, padding, waste, sorted by packing ratio
+ profile_packing.return_value = [(1, .9, 0), (2, .8, 0), (3, .7, .5)]
+
+ packing_ratio = auto_packing_ratio(
+ dataloader_cfg=DictConfig({'dataset': {
+ 'max_seq_len': 2048
+ }}),
+ tokenizer=None,
+ device_batch_size=1,
+ ) # Dummy values, profiling results are already set.
+
+ # auto packing ratio should choose 2 because packing ratio is maximized while waste is 0.
+ assert packing_ratio == 2
+
+
+@pytest.mark.world_size(2)
+@pytest.mark.gpu
+@patch('llmfoundry.data.packing.profile_packing')
+def test_dist_auto_packing(profile_packing: Mock):
+ """Tests that auto packing works with world size > 1."""
+ dist.initialize_dist('gpu')
+
+ # List of tuples of packing_ratio, padding, waste, sorted by packing ratio
+ if dist.get_global_rank() == 0:
+ profile_packing.return_value = [(1, .9, 0), (2, .8, 0),
+ (3, .7, 0)] # should pick 3
+ else:
+ profile_packing.return_value = [(1, .9, 0), (2, .8, 0),
+ (3, .7, .5)] # should pick 2
+
+ packing_ratio = auto_packing_ratio(
+ dataloader_cfg=DictConfig({'dataset': {
+ 'max_seq_len': 2048
+ }}),
+ tokenizer=None,
+ device_batch_size=1,
+ ) # Dummy values, profiling results are already set.
+
+ # auto packing ratio should choose 2 because it's the minimum between ranks.
+ assert packing_ratio == 2
+
+
+@pytest.mark.parametrize('packing_ratio', ['auto', 2.0])
+def test_packing_with_dataloader(packing_ratio: Any):
+ """Tests that packing works with a dataloader."""
+ reproducibility.seed_all(17)
+ tokenizer = build_tokenizer('gpt2', {})
+ cfg = DictConfig({
+ 'name': 'finetuning',
+ 'dataset': {
+ 'hf_name': 'tatsu-lab/alpaca',
+ 'split': 'train',
+ 'max_seq_len': 2048,
+ 'decoder_only_format': True,
+ 'allow_pad_trimming': False,
+ 'packing_ratio': packing_ratio,
+ 'shuffle': False,
+ },
+ 'drop_last': False,
+        # Need to test with 0 num_workers because the packing collator object
+        # gets copied per worker, and we cannot check the waste for child
+        # processes.
+ 'num_workers': 0,
+ 'pin_memory': False,
+ 'prefetch_factor': None,
+ 'persistent_workers': False,
+ 'timeout': 0,
+ })
+
+ loader = build_finetuning_dataloader(cfg, tokenizer,
+ device_batch_size=6).dataloader
+
+ assert isinstance(loader, DataLoader)
+ pack_collator = loader.collate_fn
+ assert isinstance(pack_collator, BinPackCollator)
+
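+    # Iterate a few batches so the collator accumulates waste/padding stats.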
+ batch_ix = 0
+ for _ in loader:
+ batch_ix += 1
+ if batch_ix >= 3:
+ break
+
+ padding = (1 - pack_collator.efficiency)
+ if packing_ratio == 'auto':
+ assert pack_collator.waste == approx(0)
+ assert padding == approx(0.1197916, rel=.01)
+ else:
+ assert pack_collator.waste == approx(0)
+ assert padding == approx(0.873720, rel=.01)
diff --git a/tests/test_rope_dail_vs_hf.py b/tests/test_rope_dail_vs_hf.py
new file mode 100644
index 0000000000..598e308546
--- /dev/null
+++ b/tests/test_rope_dail_vs_hf.py
@@ -0,0 +1,145 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
+import pytest
+import torch
+from composer.core.precision import get_precision_context
+from omegaconf import OmegaConf as om
+
+from llmfoundry.models.layers.attention import is_flash_v2_installed
+from llmfoundry.models.mpt.modeling_mpt import gen_rotary_embedding
+
+
+@pytest.mark.gpu
+@pytest.mark.parametrize('clip_qkv', [True, False])
+@pytest.mark.parametrize('qk_ln', [True, False])
+@pytest.mark.parametrize(
+ 'attn_type',
+ ['multihead_attention', 'multiquery_attention', 'grouped_query_attention'])
+@pytest.mark.parametrize('seq_len', [1, 233, 2048])
+def test_rope_dail_vs_hf(clip_qkv: bool,
+ qk_ln: bool,
+ attn_type: str,
+ seq_len: int,
+ device: str = 'cuda'):
+ # compare rope rotations for the dail vs hf implementations
+ if not is_flash_v2_installed():
+ pytest.skip('dail implementation of rope requires flash attention 2.')
+
+ from llmfoundry.models.layers import attention
+
+ cfg = om.create({
+ 'attn_impl': 'flash',
+ 'd_model': 128,
+ 'n_heads': 4,
+ 'attn_pdrop': 0,
+ 'clip_qkv': clip_qkv,
+ 'qk_ln': qk_ln,
+ })
+
+ batch_size = 2
+ assert cfg.d_model % cfg.n_heads == 0
+ if attn_type == 'grouped_query_attention':
+ cfg.kv_n_heads = 2
+
+ attn0 = attention.ATTN_CLASS_REGISTRY[attn_type](**cfg).to(device)
+ attn1 = attention.ATTN_CLASS_REGISTRY[attn_type](**cfg).to(device)
+
+ attn1.load_state_dict(attn0.state_dict())
+ x0 = torch.randn(batch_size, seq_len, cfg.d_model).to(device)
+ x1 = x0.clone().detach()
+ x0.requires_grad = True
+ x1.requires_grad = True
+ attention_mask = torch.ones(batch_size, seq_len).to(device).bool()
+
+ with get_precision_context('amp_bf16'):
+ dail_rope_config = {
+ 'rope_theta': 10000,
+ 'rope_impl': 'dail',
+ 'rope_dail_config': {
+ 'type': 'original',
+ 'pos_idx_in_fp32': True,
+ 'xpos_scale_base': 512,
+ }
+ }
+ hf_rope_config = {
+ 'rope_theta': 10000,
+ 'rope_impl': 'hf',
+ 'rope_hf_config': {
+ 'type': 'no_scaling',
+ 'factor': 1.0,
+ }
+ }
+
+ dail_rope = gen_rotary_embedding(
+ rope_head_dim=cfg.d_model // cfg.n_heads,
+ rope_impl=dail_rope_config['rope_impl'],
+ rope_theta=dail_rope_config['rope_theta'],
+ rope_dail_config=dail_rope_config['rope_dail_config'],
+ rope_hf_config={},
+ max_seq_len=seq_len).to('cuda')
+ dail_rope_w_meta_info = {
+ 'impl': 'dail',
+ 'rotary_emb': dail_rope,
+ 'offset_info': 0,
+ 'seq_len': seq_len,
+ }
+
+ hf_rope = gen_rotary_embedding(
+ rope_head_dim=cfg.d_model // cfg.n_heads,
+ rope_impl=hf_rope_config['rope_impl'],
+ rope_theta=hf_rope_config['rope_theta'],
+ rope_dail_config={},
+ rope_hf_config=hf_rope_config['rope_hf_config'],
+ max_seq_len=seq_len).to('cuda')
+ pos = torch.arange(seq_len).unsqueeze(0).to(device='cuda')
+ # adjust the position indices to account for padding tokens
+ pos = torch.clamp(
+ pos - torch.cumsum((~attention_mask).to(torch.int32), dim=1),
+ min=0,
+ )
+ hf_rope_w_meta_info = {
+ 'impl': 'hf',
+ 'rotary_emb': hf_rope,
+ 'offset_info': pos,
+ 'seq_len': seq_len,
+ }
+
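+        # Run the same attention weights with the two RoPE implementations;
+        # outputs and gradients should agree within bf16 tolerance.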
+ y0, _, _ = attn0(x0,
+ past_key_value=None,
+ attn_bias=None,
+ attention_mask=attention_mask,
+ rotary_emb_w_meta_info=dail_rope_w_meta_info,
+ is_causal=True)
+
+ y1, _, _ = attn1(x1,
+ past_key_value=None,
+ attn_bias=None,
+ attention_mask=attention_mask,
+ rotary_emb_w_meta_info=hf_rope_w_meta_info,
+ is_causal=True)
+
+ y0 *= attention_mask.unsqueeze(-1)
+ y1 *= attention_mask.unsqueeze(-1)
+
+ loss0 = y0.sum()
+ loss1 = y1.sum()
+
+ loss0.backward()
+ loss1.backward()
+
+ torch.testing.assert_close(y0, y1, rtol=1e-2, atol=1e-2)
+
+ torch_name_param_map = {n: p for n, p in attn1.named_parameters()}
+ for n, p in attn0.named_parameters():
+ tp = torch_name_param_map[n]
+ assert p.grad is not None
+ assert tp.grad is not None
+ torch.testing.assert_close(p, tp, rtol=1e-2, atol=1e-2)
+        # Relaxed to an l2-norm based check.
+ assert torch.norm(tp.grad - p.grad) <= 1e-2 + 1e-2 * torch.norm(p.grad)
+
+ assert x0.grad is not None
+ assert x1.grad is not None
+    # Relaxed to an l2-norm based check.
+ assert torch.norm(x0.grad - x1.grad) <= 1e-2 + 1e-2 * torch.norm(x0.grad)
diff --git a/tests/test_tiktoken.py b/tests/test_tiktoken.py
index 85ff18100b..d1568e6d2a 100644
--- a/tests/test_tiktoken.py
+++ b/tests/test_tiktoken.py
@@ -2,7 +2,7 @@
# SPDX-License-Identifier: Apache-2.0
import pathlib
-from typing import TYPE_CHECKING, Optional, Tuple
+from typing import TYPE_CHECKING, List, Optional, Tuple
import pytest
import transformers
@@ -49,15 +49,18 @@ def get_tokenizers_for_testing(
encoding_name: Optional[str],
tmp_path: pathlib.Path,
add_bos_token: bool = False,
- add_eos_token: bool = False
+ add_eos_token: bool = False,
+ additional_special_tokens: Optional[List[str]] = None,
) -> Tuple[TiktokenTokenizerWrapper, TiktokenTokenizerWrapper, 'Encoding']:
tiktoken = pytest.importorskip('tiktoken')
# Construction
- wrapped_tokenizer = TiktokenTokenizerWrapper(model_name=model_name,
- encoding_name=encoding_name,
- add_bos_token=add_bos_token,
- add_eos_token=add_eos_token)
+ wrapped_tokenizer = TiktokenTokenizerWrapper(
+ model_name=model_name,
+ encoding_name=encoding_name,
+ add_bos_token=add_bos_token,
+ add_eos_token=add_eos_token,
+ additional_special_tokens=additional_special_tokens)
if model_name is not None:
original_tokenizer = tiktoken.encoding_for_model(model_name)
else:
@@ -176,6 +179,10 @@ def test_tiktoken_vocab(model_name: Optional[str], encoding_name: Optional[str],
didnt_match = []
for key, value in wrapped_vocab.items():
+ # Skip checking the extra ids we pad the vocab with
+        if key.startswith('<extra_id'):
+ continue
+
if original_tokenizer.encode(key, allowed_special='all') == [value]:
continue
else:
@@ -232,3 +239,23 @@ def test_tiktoken_encode_plus(model_name: Optional[str],
encoded_special_mask = encoded_outputs.special_tokens_mask
assert encoded_special_mask[0] == 1
assert encoded_special_mask[-1] == 1
+
+
+@pytest.mark.parametrize('model_name,encoding_name',
+ MODEL_ENCODING_NAME_PARAMETRIZATION)
+def test_additional_special_tokens(model_name: Optional[str],
+ encoding_name: Optional[str],
+ tmp_path: pathlib.Path):
+ special_token_to_add = '<|im_start|>'
+ wrapped_tokenizer, _, _ = get_tokenizers_for_testing(
+ model_name,
+ encoding_name,
+ tmp_path,
+ add_bos_token=False,
+ add_eos_token=False,
+ additional_special_tokens=[special_token_to_add])
+ encoded_outputs = wrapped_tokenizer(special_token_to_add +
+ ' hello')['input_ids']
+
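+    # The added special token is appended after the base vocab, so it encodes
+    # to id == vocab_size, and ' hello' encodes to one additional token.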
+ assert encoded_outputs[0] == wrapped_tokenizer.vocab_size
+ assert len(encoded_outputs) == 2
diff --git a/tests/test_training.py b/tests/test_training.py
index 9d40fc2a78..214909cc28 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -6,7 +6,7 @@
import shutil
import sys
from argparse import Namespace
-from typing import Any
+from typing import Any, Optional
import pytest
from composer.loggers import InMemoryLogger
@@ -114,7 +114,11 @@ def set_correct_cwd():
os.chdir('..')
-def test_train_gauntlet(set_correct_cwd: Any, tmp_path: pathlib.Path):
+@pytest.mark.parametrize('averages', [{
+ 'core_average': ['language_understanding_lite']
+}, None])
+def test_train_gauntlet(averages: Optional[dict], set_correct_cwd: Any,
+ tmp_path: pathlib.Path):
"""Test training run with a small dataset."""
dataset_name = create_c4_dataset_xsmall(tmp_path)
test_cfg = gpt_tiny_cfg(dataset_name, 'cpu')
@@ -155,6 +159,9 @@ def test_train_gauntlet(set_correct_cwd: Any, tmp_path: pathlib.Path):
])
})
+ if averages is not None:
+ test_cfg.eval_gauntlet['averages'] = averages
+
test_cfg.icl_seq_len = 128
test_cfg.max_duration = '1ba'
test_cfg.eval_interval = '1ba'
@@ -167,14 +174,20 @@ def test_train_gauntlet(set_correct_cwd: Any, tmp_path: pathlib.Path):
inmemorylogger = trainer.logger.destinations[
0] # pyright: ignore [reportGeneralTypeIssues]
assert isinstance(inmemorylogger, InMemoryLogger)
- assert 'icl/metrics/eval_gauntlet/average' in inmemorylogger.data.keys()
- assert isinstance(inmemorylogger.data['icl/metrics/eval_gauntlet/average'],
- list)
- assert len(inmemorylogger.data['icl/metrics/eval_gauntlet/average'][-1]) > 0
+
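+    # With no custom averages, the gauntlet logs a 'default_average' metric;
+    # otherwise the configured 'core_average' key is logged instead.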
+ category_name = 'default_average' if averages is None else 'core_average'
+ assert f'icl/metrics/eval_gauntlet/{category_name}' in inmemorylogger.data.keys(
+ )
assert isinstance(
- inmemorylogger.data['icl/metrics/eval_gauntlet/average'][-1], tuple)
+ inmemorylogger.data[f'icl/metrics/eval_gauntlet/{category_name}'], list)
+ assert len(inmemorylogger.data[f'icl/metrics/eval_gauntlet/{category_name}']
+ [-1]) > 0
+ assert isinstance(
+ inmemorylogger.data[f'icl/metrics/eval_gauntlet/{category_name}'][-1],
+ tuple)
- assert inmemorylogger.data['icl/metrics/eval_gauntlet/average'][-1][-1] == 0
+ assert inmemorylogger.data[f'icl/metrics/eval_gauntlet/{category_name}'][
+ -1][-1] == 0
def test_train_multi_eval(set_correct_cwd: Any, tmp_path: pathlib.Path):