From 5465db4ff9850bc12afcb6fedd5737375bfc5fec Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Thu, 12 Sep 2024 12:45:12 -0700 Subject: [PATCH 01/17] Raise DatasetTooSmall exception if canonical nodes is less than num samples (#1518) Co-authored-by: Saaketh Narayan Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- .../data_prep/convert_text_to_mds.py | 4 +- llmfoundry/data/finetuning/tasks.py | 20 ++++++++- llmfoundry/utils/exceptions.py | 6 +-- tests/data/test_dataset.py | 44 +++++++++++++++++++ 4 files changed, 69 insertions(+), 5 deletions(-) create mode 100644 tests/data/test_dataset.py diff --git a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py index 7c40a7e698..9a1f8a912d 100644 --- a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py +++ b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py @@ -478,7 +478,9 @@ def convert_text_to_mds( index_path = os.path.join(local_output_folder, 'index.json') with open(index_path, 'r') as index_file: if not json.load(index_file)['shards']: - raise DatasetTooSmallError() + raise DatasetTooSmallError( + reason='No shards were created when converting text to MDS.', + ) # Write a done file with the args and object names write_done_file(local_output_folder, args_str, object_names) diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index 297962dd8a..e8f6484ef2 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -73,6 +73,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]: ALLOWED_RESPONSE_KEYS, ChatTemplateError, ConsecutiveRepeatedChatRolesError, + DatasetTooSmallError, IncorrectMessageKeyQuantityError, InvalidContentTypeError, InvalidExampleTypeError, @@ -1033,7 +1034,24 @@ def build_from_streaming( *args: Any, **kwargs: Any, ) -> StreamingFinetuningDataset: - return self.streaming_dataset_class(*args, **kwargs) + dataset = self.streaming_dataset_class(*args, **kwargs) + num_canonical_nodes = dataset.num_canonical_nodes + num_samples = dataset.num_samples + if num_canonical_nodes is None: + num_physical_nodes = dist.get_world_size( + ) // dist.get_local_world_size() + if num_samples < num_physical_nodes: + raise DatasetTooSmallError( + f'{num_samples=} is less than {dist.get_world_size() // dist.get_local_world_size()}, the number of physical nodes. ', + ) + + if num_canonical_nodes is not None and num_samples < num_canonical_nodes: + raise DatasetTooSmallError( + f'{num_samples=} is less than {num_canonical_nodes=}. ' + + 'Please check your index.json file and ensure that your dataset has been written out correctly.' + + 'If this was intended, reduce num_canonical_nodes.', + ) + return dataset dataset_constructor = DatasetConstructor() diff --git a/llmfoundry/utils/exceptions.py b/llmfoundry/utils/exceptions.py index 345a254407..68045fdaa3 100644 --- a/llmfoundry/utils/exceptions.py +++ b/llmfoundry/utils/exceptions.py @@ -376,9 +376,9 @@ def __init__(self, dataset_name: str, split: str) -> None: class DatasetTooSmallError(UserError): """Error thrown when the dataset is too small to be processed.""" - def __init__(self) -> None: - message = f'Your dataset is too small and produced no complete samples during preprocessing. Please provide more data.' - super().__init__(message) + def __init__(self, reason: str) -> None: + message = f'Your dataset is too small and produced no complete samples or too few samples. Please provide more data. 
{reason}' + super().__init__(message, reason=reason) class RunTimeoutError(InternalError): diff --git a/tests/data/test_dataset.py b/tests/data/test_dataset.py new file mode 100644 index 0000000000..071c189b68 --- /dev/null +++ b/tests/data/test_dataset.py @@ -0,0 +1,44 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 +from contextlib import nullcontext +from typing import Optional +from unittest import mock + +import pytest + +from llmfoundry.data.finetuning.tasks import dataset_constructor +from llmfoundry.utils.exceptions import DatasetTooSmallError + + +@pytest.mark.parametrize('num_canonical_nodes', [None, 8, 2]) +def test_finetuning_streaming_dataset_too_small( + num_canonical_nodes: Optional[int], +): + num_samples = 2 + + class MockDataset: + + def __init__(self): + self.num_canonical_nodes = num_canonical_nodes + self.num_samples = num_samples + + class MockDist: + + def get_world_size(self): + return 32 + + def get_local_world_size(self): + return 8 + + result_context = nullcontext( + ) if num_canonical_nodes == 2 else pytest.raises(DatasetTooSmallError) + with result_context: + with mock.patch( + 'llmfoundry.data.finetuning.tasks.dist', + new=MockDist(), + ): + with mock.patch( + 'llmfoundry.data.finetuning.tasks.DatasetConstructor.streaming_dataset_class', + new=MockDataset, + ): + dataset_constructor.build_from_streaming() From dab768f7e27d0d725bb911a9671d147532e411bc Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Thu, 12 Sep 2024 13:09:50 -0700 Subject: [PATCH 02/17] Add permissions check for delta table reading (#1522) --- .../command_utils/data_prep/convert_delta_to_json.py | 7 +++++++ llmfoundry/utils/exceptions.py | 8 ++++++++ 2 files changed, 15 insertions(+) diff --git a/llmfoundry/command_utils/data_prep/convert_delta_to_json.py b/llmfoundry/command_utils/data_prep/convert_delta_to_json.py index 50d11b1222..666d0278c6 100644 --- a/llmfoundry/command_utils/data_prep/convert_delta_to_json.py +++ b/llmfoundry/command_utils/data_prep/convert_delta_to_json.py @@ -22,6 +22,7 @@ ClusterDoesNotExistError, FailedToConnectToDatabricksError, FailedToCreateSQLConnectionError, + InsufficientPermissionsError, ) if TYPE_CHECKING: @@ -454,6 +455,12 @@ def fetch( sparkSession, ) except Exception as e: + from pyspark.errors import AnalysisException + if isinstance(e, AnalysisException): + if 'INSUFFICIENT_PERMISSIONS' in e.message: # pyright: ignore + raise InsufficientPermissionsError( + action=f'reading from {tablename}', + ) from e raise RuntimeError( f'Error in get rows from {tablename}. Restart sparkSession and try again', ) from e diff --git a/llmfoundry/utils/exceptions.py b/llmfoundry/utils/exceptions.py index 68045fdaa3..11895564f2 100644 --- a/llmfoundry/utils/exceptions.py +++ b/llmfoundry/utils/exceptions.py @@ -427,3 +427,11 @@ def __init__( window_size=window_size, loss_window=loss_window, ) + + +class InsufficientPermissionsError(UserError): + """Error thrown when the user does not have sufficient permissions.""" + + def __init__(self, action: str) -> None: + message = f'Insufficient permissions when {action}. Please check your permissions.' 
+ super().__init__(message, action=action) From a862d6e4d269d7f881c6b59bdc9667ef4cab5613 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Thu, 12 Sep 2024 16:52:06 -0700 Subject: [PATCH 03/17] Add HuggingFaceCheckpointer option for only registering final checkpoint (#1516) --- llmfoundry/callbacks/hf_checkpointer.py | 160 +++++++++++++----- llmfoundry/command_utils/train.py | 7 +- .../inference/test_convert_composer_to_hf.py | 154 +++++++++++++++-- 3 files changed, 257 insertions(+), 64 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index f05e7322a8..4e6a501f2f 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -10,6 +10,7 @@ import shutil import tempfile import time +import warnings from multiprocessing.context import SpawnProcess from pathlib import Path from typing import Any, Optional, Sequence, Union @@ -18,6 +19,7 @@ import torch import torch.nn as nn from composer.core import Callback, Event, Precision, State, Time, TimeUnit +from composer.devices import Device from composer.loggers import Logger, MLFlowLogger from composer.models import HuggingFaceModel from composer.utils import ( @@ -161,6 +163,10 @@ class HuggingFaceCheckpointer(Callback): keys ``input_example`` and ``signature``. flatten_imports (Sequence[str]): A sequence of import prefixes that will be flattened when editing MPT files. + final_register_only (bool): If true, only register the model in the MLFlow + registry on the last batch and do not save the HuggingFace checkpoint. If + registration fails or mlflow_registered_model_name is not set, then we will + fallback to saving the HuggingFace checkpoint. """ def __init__( @@ -173,6 +179,7 @@ def __init__( mlflow_registered_model_name: Optional[str] = None, mlflow_logging_config: Optional[dict] = None, flatten_imports: Sequence[str] = ('llmfoundry',), + final_register_only: bool = False, ): _, _, self.save_dir_format_str = parse_uri(save_folder) self.overwrite = overwrite @@ -185,8 +192,18 @@ def __init__( self.flatten_imports = flatten_imports self.using_peft = False - # mlflow config setup + self.final_register_only = final_register_only + self.mlflow_registered_model_name = mlflow_registered_model_name + if self.final_register_only and self.mlflow_registered_model_name is None: + self.final_register_only = False + warnings.warn( + 'final_register_only is set to True, but mlflow_registered_model_name is not set. ' + + + f'Defaulting to final_register_only=False and saving the HuggingFace checkpoint to {save_folder=}.', + ) + + # mlflow config setup if mlflow_logging_config is None: mlflow_logging_config = {} if self.mlflow_registered_model_name is not None: @@ -249,7 +266,7 @@ def __init__( self.last_checkpoint_batch: Optional[Time] = None self.mlflow_loggers = [] - self.child_processes: list[SpawnProcess] = [] + self.register_processes: list[SpawnProcess] = [] # Temporary save directory used by child_processes. 
self.temp_save_dir = None @@ -259,7 +276,17 @@ def run_event(self, event: Event, state: State, logger: Logger) -> None: state, event, ) and self.last_checkpoint_batch != state.timestamp.batch: - self._save_checkpoint(state, logger) + is_last_batch = self._is_last_batch(state) + self._save_checkpoint( + state, + logger, + register_to_mlflow=( + self.mlflow_registered_model_name is not None and + is_last_batch + ), + upload_to_save_folder=not self.final_register_only or + not is_last_batch, + ) elif event == Event.INIT: if not isinstance(state.model, HuggingFaceModel): raise ValueError( @@ -300,7 +327,7 @@ def run_event(self, event: Event, state: State, logger: Logger) -> None: # Wait for all child processes spawned by the callback to finish. timeout = 3600 wait_start = time.time() - while not self._all_child_processes_done(): + while not self._all_register_processes_done(state.device): wait_time = time.time() - wait_start if wait_time > timeout: raise TimeoutError( @@ -308,6 +335,19 @@ def run_event(self, event: Event, state: State, logger: Logger) -> None: ) time.sleep(2) + if self._any_register_processes_error( + state.device, + ) and self.final_register_only: + log.error( + 'An error occurred in one or more registration processes. Fallback to saving the HuggingFace checkpoint.', + ) + self._save_checkpoint( + state, + logger, + upload_to_save_folder=True, + register_to_mlflow=False, + ) + # Clean up temporary save directory; all processes are done with it. if self.temp_save_dir is not None: shutil.rmtree(self.temp_save_dir) @@ -339,12 +379,23 @@ def _is_last_batch(self, state: State): return False - def _all_child_processes_done(self) -> bool: - not_done = any(process.is_alive() for process in self.child_processes) - x = torch.tensor(1 if not_done else 0).to(device='cuda') + def _all_register_processes_done(self, device: Device) -> bool: + not_done = any( + process.is_alive() for process in self.register_processes + ) + x = device.tensor_to_device(torch.tensor(1 if not_done else 0)) dist.all_reduce(x, reduce_operation='MAX') return x.item() == 0 + def _any_register_processes_error(self, device: Device) -> bool: + has_errors = any( + process.exitcode is not None and process.exitcode != 0 + for process in self.register_processes + ) + x = device.tensor_to_device(torch.tensor(1 if has_errors else 0)) + dist.all_reduce(x, reduce_operation='MAX') + return x.item() == 1 + def transform_model_and_tokenizer( self, model: PreTrainedModel, @@ -412,7 +463,21 @@ def transform_model_pre_registration( """ return model - def _save_checkpoint(self, state: State, logger: Logger): + def _save_checkpoint( + self, + state: State, + logger: Logger, + upload_to_save_folder: bool, + register_to_mlflow: bool, + ): + """Save a HuggingFace formatted checkpoint. + + Args: + state (State): The training state. + logger (Logger): The logger. + upload_to_save_folder (bool): Whether to upload the HF checkpoint to the save folder. + register_to_mlflow (bool): Whether to register the model to MLFlow + """ del logger # unused self.last_checkpoint_batch = state.timestamp.batch @@ -548,50 +613,53 @@ def tensor_hook( ].base_model_name_or_path = self.pretrained_model_name log.debug('Saving Hugging Face checkpoint to disk') - # This context manager casts the TE extra state in io.BytesIO format to tensor format - # Needed for proper hf ckpt saving. 
- context_manager = te.onnx_export( - True, - ) if is_te_imported and state.precision == Precision.AMP_FP8 else contextlib.nullcontext( - ) - with context_manager: - new_model_instance.save_pretrained(temp_save_dir) - if original_tokenizer is not None: - assert isinstance( - original_tokenizer, - PreTrainedTokenizerBase, - ) - original_tokenizer.save_pretrained(temp_save_dir) - - # Only need to edit files for MPT because it has custom code - if new_model_instance.config.model_type == 'mpt': - log.debug('Editing MPT files for HuggingFace compatibility') - edit_files_for_hf_compatibility( - temp_save_dir, - self.flatten_imports, - ) - if self.remote_ud is not None: - for filename in os.listdir(temp_save_dir): - remote_file_name = os.path.join(save_dir, filename) - remote_file_uri = self.remote_ud.remote_backend.get_uri( - remote_file_name, - ) - log.info( - f'Uploading HuggingFace formatted checkpoint to {remote_file_uri}', + if upload_to_save_folder: + # This context manager casts the TE extra state in io.BytesIO format to tensor format + # Needed for proper hf ckpt saving. + context_manager = te.onnx_export( + True, + ) if is_te_imported and state.precision == Precision.AMP_FP8 else contextlib.nullcontext( + ) + with context_manager: + new_model_instance.save_pretrained(temp_save_dir) + if original_tokenizer is not None: + assert isinstance( + original_tokenizer, + PreTrainedTokenizerBase, ) - self.remote_ud.upload_file( - state=state, - remote_file_name=remote_file_name, - file_path=Path(os.path.join(temp_save_dir, filename)), - overwrite=self.overwrite, + original_tokenizer.save_pretrained(temp_save_dir) + + # Only need to edit files for MPT because it has custom code + if new_model_instance.config.model_type == 'mpt': + log.debug('Editing MPT files for HuggingFace compatibility') + edit_files_for_hf_compatibility( + temp_save_dir, + self.flatten_imports, ) + if self.remote_ud is not None: + for filename in os.listdir(temp_save_dir): + remote_file_name = os.path.join(save_dir, filename) + remote_file_uri = self.remote_ud.remote_backend.get_uri( + remote_file_name, + ) + log.info( + f'Uploading HuggingFace formatted checkpoint to {remote_file_uri}', + ) + self.remote_ud.upload_file( + state=state, + remote_file_name=remote_file_name, + file_path=Path( + os.path.join(temp_save_dir, filename), + ), + overwrite=self.overwrite, + ) + dist.barrier() if dist.get_global_rank() == 0: - if self.mlflow_registered_model_name and self._is_last_batch(state): - + if register_to_mlflow: new_model_instance = self.transform_model_pre_registration( new_model_instance, ) @@ -680,7 +748,7 @@ def tensor_hook( # Restore the monitor process. if monitor_process is not None: mlflow_logger.monitor_process = monitor_process # type: ignore - self.child_processes.append(process) + self.register_processes.append(process) # Save the temporary directory to be cleaned up later. 
if use_temp_dir: diff --git a/llmfoundry/command_utils/train.py b/llmfoundry/command_utils/train.py index 73fa4c8d5a..14b7980d57 100644 --- a/llmfoundry/command_utils/train.py +++ b/llmfoundry/command_utils/train.py @@ -584,7 +584,12 @@ def train(cfg: DictConfig) -> Trainer: ) hf_checkpointer_callback = hf_checkpointer_callbacks[0] - hf_checkpointer_callback._save_checkpoint(trainer.state, trainer.logger) + hf_checkpointer_callback._save_checkpoint( + trainer.state, + trainer.logger, + upload_to_save_folder=True, + register_to_mlflow=True, + ) return trainer if train_cfg.only_composer_checkpoint: diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py index b863e1d0a8..4f1bd63c62 100644 --- a/tests/a_scripts/inference/test_convert_composer_to_hf.py +++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py @@ -9,6 +9,7 @@ import shutil from argparse import Namespace from typing import Any, Callable, Optional, cast +from unittest import mock from unittest.mock import ANY, MagicMock, patch import catalogue @@ -314,9 +315,15 @@ class MockSpawnProcess: multiprocessing, so we need to patch SpawnProcess for tests. """ - def __init__(self, target: Callable, kwargs: dict[str, Any]): + def __init__( + self, + target: Callable, + kwargs: dict[str, Any], + exitcode: int = 0, + ): self.target = target self.kwargs = kwargs + self.exitcode = exitcode def start(self): self.target(**self.kwargs) @@ -325,6 +332,133 @@ def is_alive(self) -> bool: return False +def _create_mlflow_logger_mock() -> MagicMock: + mlflow_logger_mock = MagicMock(spec=MLFlowLogger) + mlflow_logger_mock.state_dict = lambda *args, **kwargs: {} + mlflow_logger_mock.save_model = MagicMock(wraps=_save_model_mock) + mlflow_logger_mock.register_model_with_run_id = MagicMock() + mlflow_logger_mock.model_registry_prefix = '' + mlflow_logger_mock._experiment_id = 'mlflow-experiment-id' + mlflow_logger_mock._run_id = 'mlflow-run-id' + mlflow_logger_mock._enabled = True + mlflow_logger_mock.run_url = 'fake-url' + return mlflow_logger_mock + + +def _create_optimizer(original_model: torch.nn.Module) -> torch.optim.Optimizer: + optimizer_config = _OPTIMIZER_CFG() + optimizer_name = optimizer_config.pop('name') + return build_optimizer( + original_model, + optimizer_name, + optimizer_config, + ) + + +@pytest.mark.gpu +@pytest.mark.parametrize('mlflow_registry_error', [True, False]) +@pytest.mark.parametrize( + 'mlflow_registered_model_name', + [None, 'dummy-registered-name'], +) +@patch('os.cpu_count', MagicMock(return_value=1)) +@patch( + 'llmfoundry.callbacks.hf_checkpointer.SpawnProcess', + new=MockSpawnProcess, +) +def test_final_register_only( + mlflow_registry_error: bool, + mlflow_registered_model_name: Optional[str], + tiny_ft_dataloader: DataLoader, + tmp_path: pathlib.Path, + build_tiny_mpt: Callable, +): + if mlflow_registry_error and mlflow_registered_model_name is None: + pytest.skip( + 'Cannot test mlflow_registry_error without mlflow_registered_model_name', + ) + + delete_transformers_cache() + + dist.initialize_dist(get_device('gpu')) + + precision_str = 'bfloat16' + + checkpointer_callback = HuggingFaceCheckpointer( + save_folder=os.path.join(tmp_path, 'checkpoints'), + save_interval='1dur', + precision=precision_str, + mlflow_registered_model_name=mlflow_registered_model_name, + final_register_only=True, + ) + + original_model = build_tiny_mpt() + + optimizer = _create_optimizer(original_model) + + mlflow_logger_mock = _create_mlflow_logger_mock() + + 
checkpointer_callback._save_checkpoint = MagicMock( + wraps=checkpointer_callback._save_checkpoint, + ) + trainer = Trainer( + model=original_model, + device='gpu', + train_dataloader=tiny_ft_dataloader, + max_duration='1ba', + callbacks=[checkpointer_callback], + loggers=[mlflow_logger_mock], + optimizers=optimizer, + save_latest_filename=None, + ) + + with mock.patch( + 'llmfoundry.callbacks.hf_checkpointer.SpawnProcess', + new=lambda target, + kwargs: MockSpawnProcess( + target, + kwargs, + exitcode=1 if mlflow_registry_error else 0, + ), + ): + trainer.fit() + + if mlflow_registered_model_name is not None: + # We should always attempt to register the model once + assert mlflow_logger_mock.register_model_with_run_id.call_count == 1 + if mlflow_registry_error: + # If the registry fails, we should still save the model + assert mlflow_logger_mock.register_model_with_run_id.call_count == 1 + assert checkpointer_callback._save_checkpoint.call_count == 2 + assert checkpointer_callback._save_checkpoint.call_args_list[ + 0].kwargs == { + 'register_to_mlflow': True, + 'upload_to_save_folder': False, + } + assert checkpointer_callback._save_checkpoint.call_args_list[ + 1].kwargs == { + 'register_to_mlflow': False, + 'upload_to_save_folder': True, + } + else: + # No mlflow_registry_error, so we should only register the model + assert checkpointer_callback._save_checkpoint.call_count == 1 + assert checkpointer_callback._save_checkpoint.call_args_list[ + 0].kwargs == { + 'register_to_mlflow': True, + 'upload_to_save_folder': False, + } + else: + # No mlflow_registered_model_name, so we should only save the checkpoint + assert mlflow_logger_mock.register_model_with_run_id.call_count == 0 + assert checkpointer_callback._save_checkpoint.call_count == 1 + assert checkpointer_callback._save_checkpoint.call_args_list[ + 0].kwargs == { + 'register_to_mlflow': False, + 'upload_to_save_folder': True, + } + + @pytest.mark.gpu @pytest.mark.parametrize('log_to_mlflow', [True, False]) @pytest.mark.parametrize( @@ -368,23 +502,9 @@ def test_huggingface_conversion_callback_interval( original_model = build_tiny_mpt() - optimizer_config = _OPTIMIZER_CFG() - optimizer_name = optimizer_config.pop('name') - optimizer = build_optimizer( - original_model, - optimizer_name, - optimizer_config, - ) + optimizer = _create_optimizer(original_model) - mlflow_logger_mock = MagicMock(spec=MLFlowLogger) - mlflow_logger_mock.state_dict = lambda *args, **kwargs: {} - mlflow_logger_mock.save_model = MagicMock(wraps=_save_model_mock) - mlflow_logger_mock.register_model_with_run_id = MagicMock() - mlflow_logger_mock.model_registry_prefix = '' - mlflow_logger_mock._experiment_id = 'mlflow-experiment-id' - mlflow_logger_mock._run_id = 'mlflow-run-id' - mlflow_logger_mock._enabled = True - mlflow_logger_mock.run_url = 'fake-url' + mlflow_logger_mock = _create_mlflow_logger_mock() checkpointer_callback.transform_model_pre_registration = MagicMock( wraps=checkpointer_callback.transform_model_pre_registration, ) From 83ab9c30e0a2432bcc6213e4cb8b55296b13e438 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Mon, 16 Sep 2024 13:54:10 -0700 Subject: [PATCH 04/17] Replace FSDP args (#1517) Co-authored-by: v-chen_data --- llmfoundry/command_utils/eval.py | 8 ++++++-- tests/a_scripts/inference/test_convert_composer_to_hf.py | 5 +++-- tests/models/hf/test_fsdp_weight_tying.py | 2 +- tests/models/hf/test_hf_peft_wrapping.py | 2 +- tests/models/test_fsdp_act_checkpoint.py | 2 +- 5 files changed, 12 insertions(+), 7 deletions(-) diff --git 
a/llmfoundry/command_utils/eval.py b/llmfoundry/command_utils/eval.py index f622ca182d..eca16bd815 100644 --- a/llmfoundry/command_utils/eval.py +++ b/llmfoundry/command_utils/eval.py @@ -52,7 +52,7 @@ def evaluate_model( device_eval_batch_size: Union[int, float], eval_gauntlet_config: Optional[Union[str, dict[str, Any]]], eval_loader_config: Optional[Union[dict[str, Any], list[dict[str, Any]]]], - fsdp_config: Optional[dict[str, Any]], + parallelism_config: Optional[dict[str, Any]], loggers: list[LoggerDestination], python_log_level: Optional[str], precision: str, @@ -99,6 +99,10 @@ def evaluate_model( mosaicml_logger.log_metrics(metadata) mosaicml_logger._flush_metadata(force_flush=True) + fsdp_config = parallelism_config.get( + 'fsdp_config', + None, + ) if parallelism_config else None if fsdp_config and model.get('load_in_8bit', False): raise ValueError( 'The FSDP config block is not supported when loading ' + @@ -316,7 +320,7 @@ def evaluate(cfg: DictConfig) -> tuple[list[Trainer], pd.DataFrame]: device_eval_batch_size=eval_config.device_eval_batch_size, eval_gauntlet_config=eval_gauntlet_config, eval_loader_config=eval_loader_config, - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, loggers=loggers, python_log_level=eval_config.python_log_level, precision=eval_config.precision, diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py index 4f1bd63c62..66ec739a65 100644 --- a/tests/a_scripts/inference/test_convert_composer_to_hf.py +++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py @@ -1042,7 +1042,8 @@ def test_huggingface_conversion_callback( model=original_model, device='gpu', precision=trainer_precision, - fsdp_config=fsdp_config if fsdp_state_dict_type is not None else None, + parallelism_config={'fsdp': fsdp_config} + if fsdp_state_dict_type is not None else None, train_dataloader=train_dataloader, save_folder=os.path.join(tmp_path, 'checkpoints'), save_interval=save_interval, @@ -1469,7 +1470,7 @@ def test_mptmoe_huggingface_conversion_callback( trainer = Trainer( model=original_model, device='gpu', - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, train_dataloader=train_dataloader, save_folder=os.path.join(tmp_path, 'checkpoints'), save_interval=save_interval, diff --git a/tests/models/hf/test_fsdp_weight_tying.py b/tests/models/hf/test_fsdp_weight_tying.py index 69ced673a1..8e6c113169 100644 --- a/tests/models/hf/test_fsdp_weight_tying.py +++ b/tests/models/hf/test_fsdp_weight_tying.py @@ -91,7 +91,7 @@ def test_fsdp_weight_tying( trainer = Trainer( model=original_model, device='gpu', - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, train_dataloader=[], device_train_microbatch_size=1, ) diff --git a/tests/models/hf/test_hf_peft_wrapping.py b/tests/models/hf/test_hf_peft_wrapping.py index 56cb36c8c1..01acc22a60 100644 --- a/tests/models/hf/test_hf_peft_wrapping.py +++ b/tests/models/hf/test_hf_peft_wrapping.py @@ -125,7 +125,7 @@ def test_lora_mixed_init( trainer = Trainer( model=original_model, device='gpu', - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, train_dataloader=[], device_train_microbatch_size=1, ) diff --git a/tests/models/test_fsdp_act_checkpoint.py b/tests/models/test_fsdp_act_checkpoint.py index a41574538a..366bcf7786 100644 --- a/tests/models/test_fsdp_act_checkpoint.py +++ b/tests/models/test_fsdp_act_checkpoint.py @@ -59,7 +59,7 @@ def test_fsdp_act_checkpoint( trainer = Trainer( 
model=model, device='gpu', - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, ) assert trainer.state.fsdp_enabled From 0114f33da83b5e2c43f6399f69acd8401525a9e8 Mon Sep 17 00:00:00 2001 From: Abhay Gupta Date: Mon, 16 Sep 2024 17:09:12 -0700 Subject: [PATCH 05/17] enable correct padding_idx for embedding layers (#1527) --- llmfoundry/models/mpt/modeling_mpt.py | 1 + llmfoundry/models/utils/param_init_fns.py | 3 +++ tests/models/utils/test_param_init_fns.py | 27 +++++++++++++++++++++++ 3 files changed, 31 insertions(+) diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py index 06b64101c3..cfe1172634 100644 --- a/llmfoundry/models/mpt/modeling_mpt.py +++ b/llmfoundry/models/mpt/modeling_mpt.py @@ -396,6 +396,7 @@ def __init__(self, config: MPTConfig): self.wte = SharedEmbedding( config.vocab_size, config.d_model, + padding_idx=config.pad_token_id, device=config.init_device, ) if self.learned_pos_emb: diff --git a/llmfoundry/models/utils/param_init_fns.py b/llmfoundry/models/utils/param_init_fns.py index 180e7b894c..8ad6e77c57 100644 --- a/llmfoundry/models/utils/param_init_fns.py +++ b/llmfoundry/models/utils/param_init_fns.py @@ -224,6 +224,9 @@ def embedding_init( emb_init_fn_ = init_fn_ emb_init_fn_(module.weight) + if module.padding_idx is not None: + with torch.no_grad(): + module.weight[module.padding_idx].fill_(0) return True diff --git a/tests/models/utils/test_param_init_fns.py b/tests/models/utils/test_param_init_fns.py index 0eaf60c869..11d9fba430 100644 --- a/tests/models/utils/test_param_init_fns.py +++ b/tests/models/utils/test_param_init_fns.py @@ -199,3 +199,30 @@ def test_emb_init(emb_init_cfg: Optional[tuple[str, Union[int, list[int]]]]): emb_init_uniform_lim, ) == 2 and emb_init_uniform_lim[0] == emb_init_uniform_lim[1]: assert (model.emb.weight == emb_init_uniform_lim[0]).all() + + +@pytest.mark.parametrize( + 'padding_idx', + [0, 2], +) +def test_emb_padding_init(padding_idx: int,): + cfg: dict[str, Union[int, list[int]]] = { + 'vocab_size': 64, + 'in_features': 16, + 'n_layers': 2, + 'padding_idx': padding_idx, + 'emb_init_std': 5, + } + dict_cfg = om.create(cfg) + + model = nn.Embedding( + dict_cfg.vocab_size, + dict_cfg.in_features, + dict_cfg.padding_idx, + ) + + model.apply(partial(param_init_fns.get('kaiming_normal_'), **dict_cfg)) + assert isinstance(model, torch.nn.Embedding) + + if dict_cfg.get('emb_init_std') is not None: + assert (model.weight[padding_idx] == 0).all() From 9a1b78b128a242590b00f364a99d2d2d735f9468 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 17 Sep 2024 10:29:09 -0700 Subject: [PATCH 06/17] Revert "Replace FSDP args" (#1533) --- llmfoundry/command_utils/eval.py | 8 ++------ tests/a_scripts/inference/test_convert_composer_to_hf.py | 5 ++--- tests/models/hf/test_fsdp_weight_tying.py | 2 +- tests/models/hf/test_hf_peft_wrapping.py | 2 +- tests/models/test_fsdp_act_checkpoint.py | 2 +- 5 files changed, 7 insertions(+), 12 deletions(-) diff --git a/llmfoundry/command_utils/eval.py b/llmfoundry/command_utils/eval.py index eca16bd815..f622ca182d 100644 --- a/llmfoundry/command_utils/eval.py +++ b/llmfoundry/command_utils/eval.py @@ -52,7 +52,7 @@ def evaluate_model( device_eval_batch_size: Union[int, float], eval_gauntlet_config: Optional[Union[str, dict[str, Any]]], eval_loader_config: Optional[Union[dict[str, Any], list[dict[str, Any]]]], - parallelism_config: Optional[dict[str, Any]], + fsdp_config: Optional[dict[str, Any]], loggers: list[LoggerDestination], python_log_level: 
Optional[str], precision: str, @@ -99,10 +99,6 @@ def evaluate_model( mosaicml_logger.log_metrics(metadata) mosaicml_logger._flush_metadata(force_flush=True) - fsdp_config = parallelism_config.get( - 'fsdp_config', - None, - ) if parallelism_config else None if fsdp_config and model.get('load_in_8bit', False): raise ValueError( 'The FSDP config block is not supported when loading ' + @@ -320,7 +316,7 @@ def evaluate(cfg: DictConfig) -> tuple[list[Trainer], pd.DataFrame]: device_eval_batch_size=eval_config.device_eval_batch_size, eval_gauntlet_config=eval_gauntlet_config, eval_loader_config=eval_loader_config, - parallelism_config={'fsdp': fsdp_config}, + fsdp_config=fsdp_config, loggers=loggers, python_log_level=eval_config.python_log_level, precision=eval_config.precision, diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py index 66ec739a65..4f1bd63c62 100644 --- a/tests/a_scripts/inference/test_convert_composer_to_hf.py +++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py @@ -1042,8 +1042,7 @@ def test_huggingface_conversion_callback( model=original_model, device='gpu', precision=trainer_precision, - parallelism_config={'fsdp': fsdp_config} - if fsdp_state_dict_type is not None else None, + fsdp_config=fsdp_config if fsdp_state_dict_type is not None else None, train_dataloader=train_dataloader, save_folder=os.path.join(tmp_path, 'checkpoints'), save_interval=save_interval, @@ -1470,7 +1469,7 @@ def test_mptmoe_huggingface_conversion_callback( trainer = Trainer( model=original_model, device='gpu', - parallelism_config={'fsdp': fsdp_config}, + fsdp_config=fsdp_config, train_dataloader=train_dataloader, save_folder=os.path.join(tmp_path, 'checkpoints'), save_interval=save_interval, diff --git a/tests/models/hf/test_fsdp_weight_tying.py b/tests/models/hf/test_fsdp_weight_tying.py index 8e6c113169..69ced673a1 100644 --- a/tests/models/hf/test_fsdp_weight_tying.py +++ b/tests/models/hf/test_fsdp_weight_tying.py @@ -91,7 +91,7 @@ def test_fsdp_weight_tying( trainer = Trainer( model=original_model, device='gpu', - parallelism_config={'fsdp': fsdp_config}, + fsdp_config=fsdp_config, train_dataloader=[], device_train_microbatch_size=1, ) diff --git a/tests/models/hf/test_hf_peft_wrapping.py b/tests/models/hf/test_hf_peft_wrapping.py index 01acc22a60..56cb36c8c1 100644 --- a/tests/models/hf/test_hf_peft_wrapping.py +++ b/tests/models/hf/test_hf_peft_wrapping.py @@ -125,7 +125,7 @@ def test_lora_mixed_init( trainer = Trainer( model=original_model, device='gpu', - parallelism_config={'fsdp': fsdp_config}, + fsdp_config=fsdp_config, train_dataloader=[], device_train_microbatch_size=1, ) diff --git a/tests/models/test_fsdp_act_checkpoint.py b/tests/models/test_fsdp_act_checkpoint.py index 366bcf7786..a41574538a 100644 --- a/tests/models/test_fsdp_act_checkpoint.py +++ b/tests/models/test_fsdp_act_checkpoint.py @@ -59,7 +59,7 @@ def test_fsdp_act_checkpoint( trainer = Trainer( model=model, device='gpu', - parallelism_config={'fsdp': fsdp_config}, + fsdp_config=fsdp_config, ) assert trainer.state.fsdp_enabled From 7a23f60ad5ce25e80c3d5f3ab3badfb413743daa Mon Sep 17 00:00:00 2001 From: Saaketh Narayan Date: Tue, 17 Sep 2024 12:54:28 -0700 Subject: [PATCH 07/17] Delete unneeded inner base model in PEFT HF Checkpointer (#1532) --- llmfoundry/callbacks/hf_checkpointer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 
4e6a501f2f..65bdcb3b6c 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -585,6 +585,7 @@ def tensor_hook( new_base_model_instance, original_model.peft_config[active_adapter], ) + del new_base_model_instance else: new_model_instance = type(original_model)(new_config) new_model_instance.generation_config.update( From 2e3d14f6130ebad5a149c1c52f53fd07628e1006 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 17 Sep 2024 13:45:04 -0700 Subject: [PATCH 08/17] Add deprecation warning to fsdp_config (#1530) Co-authored-by: v-chen_data --- llmfoundry/command_utils/eval.py | 35 ++++- .../inference/test_convert_composer_to_hf.py | 5 +- tests/eval/test_eval_deprecation.py | 125 ++++++++++++++++++ tests/models/hf/test_fsdp_weight_tying.py | 2 +- tests/models/hf/test_hf_peft_wrapping.py | 2 +- tests/models/test_fsdp_act_checkpoint.py | 2 +- 6 files changed, 163 insertions(+), 8 deletions(-) create mode 100644 tests/eval/test_eval_deprecation.py diff --git a/llmfoundry/command_utils/eval.py b/llmfoundry/command_utils/eval.py index f622ca182d..e644ad1f0f 100644 --- a/llmfoundry/command_utils/eval.py +++ b/llmfoundry/command_utils/eval.py @@ -4,6 +4,7 @@ import logging import os import time +import warnings from typing import Any, Optional, Union import pandas as pd @@ -11,7 +12,7 @@ from composer.core import Callback from composer.loggers.logger_destination import LoggerDestination from composer.trainer import Trainer -from composer.utils import dist, get_device, reproducibility +from composer.utils import dist, get_device, parallelism, reproducibility from omegaconf import DictConfig from omegaconf import OmegaConf as om @@ -36,6 +37,7 @@ process_init_device, ) from llmfoundry.utils.registry_utils import import_file +from llmfoundry.utils.warnings import VersionedDeprecationWarning log = logging.getLogger(__name__) @@ -52,7 +54,6 @@ def evaluate_model( device_eval_batch_size: Union[int, float], eval_gauntlet_config: Optional[Union[str, dict[str, Any]]], eval_loader_config: Optional[Union[dict[str, Any], list[dict[str, Any]]]], - fsdp_config: Optional[dict[str, Any]], loggers: list[LoggerDestination], python_log_level: Optional[str], precision: str, @@ -62,9 +63,33 @@ def evaluate_model( callback_configs: Optional[dict[str, Any]], metadata: Optional[dict[str, str]], logged_config: dict[str, Any], + fsdp_config: Optional[dict[str, Any]] = None, + parallelism_config: Optional[dict[str, Any]] = None, should_log_config: bool = True, load_path: Optional[str] = None, ): + if parallelism_config: + deprecated_fsdp_args = list( + parallelism.FSDPConfig.__annotations__.keys(), + ) + for deprecated_arg in deprecated_fsdp_args: + if deprecated_arg in parallelism_config: + raise ValueError( + 'parallelism_config cannot contain deprecated fsdp_config arguments.', + ) + + if fsdp_config: + warnings.warn( + VersionedDeprecationWarning( + 'The argument fsdp_config is deprecated. Please use parallelism_config instead.', + remove_version='0.13.0', + ), + ) + if fsdp_config and parallelism_config: + raise ValueError( + 'Both fsdp_config and parallelism_config cannot be provided at the same time. 
Please use parallelism_config.', + ) + log.info(f'Evaluating model: {model_name}') # Build tokenizer and model tokenizer_cfg = tokenizer @@ -99,6 +124,10 @@ def evaluate_model( mosaicml_logger.log_metrics(metadata) mosaicml_logger._flush_metadata(force_flush=True) + fsdp_config = parallelism_config.get( + 'fsdp_config', + None, + ) if parallelism_config else fsdp_config if fsdp_config and model.get('load_in_8bit', False): raise ValueError( 'The FSDP config block is not supported when loading ' + @@ -146,7 +175,7 @@ def evaluate_model( callbacks=callbacks, loggers=loggers, precision=precision, - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, load_path=load_path, load_weights_only=True, progress_bar=False, diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py index 4f1bd63c62..66ec739a65 100644 --- a/tests/a_scripts/inference/test_convert_composer_to_hf.py +++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py @@ -1042,7 +1042,8 @@ def test_huggingface_conversion_callback( model=original_model, device='gpu', precision=trainer_precision, - fsdp_config=fsdp_config if fsdp_state_dict_type is not None else None, + parallelism_config={'fsdp': fsdp_config} + if fsdp_state_dict_type is not None else None, train_dataloader=train_dataloader, save_folder=os.path.join(tmp_path, 'checkpoints'), save_interval=save_interval, @@ -1469,7 +1470,7 @@ def test_mptmoe_huggingface_conversion_callback( trainer = Trainer( model=original_model, device='gpu', - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, train_dataloader=train_dataloader, save_folder=os.path.join(tmp_path, 'checkpoints'), save_interval=save_interval, diff --git a/tests/eval/test_eval_deprecation.py b/tests/eval/test_eval_deprecation.py new file mode 100644 index 0000000000..828186245a --- /dev/null +++ b/tests/eval/test_eval_deprecation.py @@ -0,0 +1,125 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +import unittest +import warnings + +from llmfoundry.command_utils.eval import evaluate_model +from llmfoundry.utils.warnings import VersionedDeprecationWarning + + +class TestEvaluateModelDeprecation(unittest.TestCase): + + def setUp(self): + self.common_args = { # type: ignore + 'tokenizer': { + 'name': 'test_tokenizer', + }, + 'model': { + 'name': 'test_model', + }, + 'model_name': 'test', + 'dist_timeout': 60, + 'run_name': 'test_run', + 'seed': 42, + 'icl_tasks': [], + 'max_seq_len': 512, + 'device_eval_batch_size': 1, + 'eval_gauntlet_config': None, + 'eval_loader_config': None, + 'loggers': [], + 'python_log_level': None, + 'precision': 'fp32', + 'eval_gauntlet_df': None, + 'eval_subset_num_batches': 1, + 'icl_subset_num_batches': None, + 'callback_configs': None, + 'metadata': None, + 'logged_config': {}, + } + + def test_no_deprecation_warning(self): + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + import composer.utils.parallelism + deprecated_fsdp_args = list( + composer.utils.parallelism.FSDPConfig.__annotations__.keys(), + ) + print(deprecated_fsdp_args) + + try: + parallelism_config = {'fsdp': {'verbose': True}} + evaluate_model( + **self.common_args, + parallelism_config=parallelism_config, + ) + except ValueError as ve: + if 'parallelism_config cannot contain deprecated fsdp_config arguments.' 
in str( + ve, + ): + self.fail( + 'Raised ValueError about deprecated fsdp_config arguments', + ) + elif 'Both fsdp_config and parallelism_config cannot be provided at the same time.' in str( + ve, + ): + self.fail( + 'Raised ValueError about both configs being provided', + ) + except Exception: + pass + + deprecation_warnings = [ + warning for warning in w + if isinstance(warning.message, VersionedDeprecationWarning) + ] + if deprecation_warnings: + self.fail('VersionedDeprecationWarning was raised') + + def test_deprecation_warning_with_deprecated_arg(self): + # Use assertRaises to catch the expected ValueError + with self.assertRaises(ValueError) as context: + # Directly call evaluate_model; do not use try-except here + evaluate_model( + **self.common_args, + parallelism_config={'activation_checkpointing': True}, + ) + + # Assert that the correct error message is in the exception + self.assertIn( + 'parallelism_config cannot contain deprecated fsdp_config arguments.', + str(context.exception), + ) + + def test_deprecation_warning_with_fsdp_config(self): + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + + try: + evaluate_model( + **self.common_args, + parallelism_config=None, + fsdp_config={'verbose': True}, + ) + except Exception: + pass + + self.assertTrue( + any( + issubclass(warning.category, VersionedDeprecationWarning) + for warning in w + ), + ) + + def test_error_with_both_fsdp_and_parallelism_config(self): + with self.assertRaises(ValueError) as context: + evaluate_model( + **self.common_args, + parallelism_config={'some_arg': True}, + fsdp_config={'some_arg': True}, + ) + + self.assertIn( + 'Both fsdp_config and parallelism_config cannot be provided at the same time.', + str(context.exception), + ) diff --git a/tests/models/hf/test_fsdp_weight_tying.py b/tests/models/hf/test_fsdp_weight_tying.py index 69ced673a1..8e6c113169 100644 --- a/tests/models/hf/test_fsdp_weight_tying.py +++ b/tests/models/hf/test_fsdp_weight_tying.py @@ -91,7 +91,7 @@ def test_fsdp_weight_tying( trainer = Trainer( model=original_model, device='gpu', - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, train_dataloader=[], device_train_microbatch_size=1, ) diff --git a/tests/models/hf/test_hf_peft_wrapping.py b/tests/models/hf/test_hf_peft_wrapping.py index 56cb36c8c1..01acc22a60 100644 --- a/tests/models/hf/test_hf_peft_wrapping.py +++ b/tests/models/hf/test_hf_peft_wrapping.py @@ -125,7 +125,7 @@ def test_lora_mixed_init( trainer = Trainer( model=original_model, device='gpu', - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, train_dataloader=[], device_train_microbatch_size=1, ) diff --git a/tests/models/test_fsdp_act_checkpoint.py b/tests/models/test_fsdp_act_checkpoint.py index a41574538a..366bcf7786 100644 --- a/tests/models/test_fsdp_act_checkpoint.py +++ b/tests/models/test_fsdp_act_checkpoint.py @@ -59,7 +59,7 @@ def test_fsdp_act_checkpoint( trainer = Trainer( model=model, device='gpu', - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, ) assert trainer.state.fsdp_enabled From d7c78229e91129d4c35006209fabd5fb2f2252e9 Mon Sep 17 00:00:00 2001 From: Shashank Rajput <144760128+ShashankMosaicML@users.noreply.github.com> Date: Sun, 22 Sep 2024 14:03:42 -0400 Subject: [PATCH 09/17] Fix reuse kv cache for torch attention (#1539) --- llmfoundry/models/layers/attention.py | 3 +++ tests/models/layers/test_flash_torch.py | 19 ++++++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git 
a/llmfoundry/models/layers/attention.py b/llmfoundry/models/layers/attention.py index a1af2235cf..625327767e 100644 --- a/llmfoundry/models/layers/attention.py +++ b/llmfoundry/models/layers/attention.py @@ -656,6 +656,9 @@ def get_qkv( 'prev_layer_key_value is None, cannot reuse_prev_layer_kv.', ) key, value = prev_layer_key_value + if self.attn_impl == 'torch': + key = rearrange(key, 'b h d s -> b s (h d)') + value = rearrange(value, 'b h s d -> b s (h d)') query = self.Wq(x) if self.clip_qkv: diff --git a/tests/models/layers/test_flash_torch.py b/tests/models/layers/test_flash_torch.py index 01a6a7576d..0a4b32a73a 100644 --- a/tests/models/layers/test_flash_torch.py +++ b/tests/models/layers/test_flash_torch.py @@ -188,7 +188,7 @@ def gen_bias(attn_impl: str): alibi=alibi, alibi_bias_max=8, ) - if attn_impl != 'flash' and attn_uses_sequence_id and sequence_id is not None: + if attn_impl == 'torch' and attn_uses_sequence_id and sequence_id is not None: assert isinstance(attn_bias, torch.Tensor) # pyright attn_bias = apply_sequence_id( attn_bias, @@ -561,8 +561,10 @@ def test_grouped_query_invalid_heads(): }, }], ) +@pytest.mark.parametrize('attn_impl', ['flash', 'torch']) def test_reuse_prev_layer_kv_cache( pos_emb_config: dict, + attn_impl: str, device: str = 'cuda', ): """Checks reusing previous layer's kv cache.""" @@ -570,7 +572,7 @@ def test_reuse_prev_layer_kv_cache( rope = pos_emb_config['rope'] cfg = { - 'attn_impl': 'flash', + 'attn_impl': attn_impl, 'd_model': 64, 'n_heads': 4, 'attn_pdrop': 0, @@ -630,6 +632,13 @@ def gen_bias(attn_impl: str): alibi=alibi, alibi_bias_max=8, ) + if attn_impl == 'torch': + assert isinstance(attn_bias, torch.Tensor) # pyright + attn_bias = apply_sequence_id( + attn_bias, + sequence_id, # type: ignore + s, + ) return attn_bias @@ -637,7 +646,7 @@ def gen_bias(attn_impl: str): sequence_id=sequence_id, S=s, attn_uses_sequence_id=True, - attn_impl='flash', + attn_impl=attn_impl, attention_mask=attention_mask, ) @@ -656,7 +665,7 @@ def gen_bias(attn_impl: str): x1.requires_grad = True with torch.autocast(x0.device.type): - attn_bias_0 = gen_bias('flash') + attn_bias_0 = gen_bias(attn_impl) alibi_slopes_0 = None if alibi: alibi_slopes_0 = gen_slopes( @@ -703,7 +712,7 @@ def gen_bias(attn_impl: str): flash_attn_padding_info=flash_attn_padding_info, alibi_slopes=alibi_slopes_0, ) - attn_bias_1 = gen_bias('flash') + attn_bias_1 = gen_bias(attn_impl) alibi_slopes_1 = None if alibi: alibi_slopes_1 = gen_slopes( From 14cff668750dc08eb4511ddee0d55b127e711dea Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Sun, 22 Sep 2024 19:49:21 -0400 Subject: [PATCH 10/17] Error on text dataset file not found (#1534) --- .../data_prep/convert_text_to_mds.py | 15 ++++++++++----- llmfoundry/utils/exceptions.py | 11 +++++++++++ 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py index 9a1f8a912d..3ea5aeb5d4 100644 --- a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py +++ b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py @@ -32,6 +32,7 @@ CannotUnicodeDecodeFile, DatasetTooSmallError, InputFolderMissingDataError, + InputFolderNotFound, OutputFolderNotEmptyError, ) @@ -125,11 +126,15 @@ def get_object_names(input_folder: str) -> list[str]: object_store = maybe_create_object_store_from_uri(input_folder) if object_store is not None: _, _, folder_prefix = parse_uri(input_folder) - names = [ - name for name in 
object_store.list_objects(folder_prefix) - if name.endswith('.txt') - ] - log.info(f'Found {len(names)} text files in remote storage') + try: + names = [ + name for name in object_store.list_objects(folder_prefix) + if name.endswith('.txt') + ] + log.info(f'Found {len(names)} text files in remote storage') + except FileNotFoundError: + raise InputFolderNotFound(folder_prefix) + else: # input_folder is a local folder names = [ diff --git a/llmfoundry/utils/exceptions.py b/llmfoundry/utils/exceptions.py index 11895564f2..900355dff5 100644 --- a/llmfoundry/utils/exceptions.py +++ b/llmfoundry/utils/exceptions.py @@ -348,6 +348,17 @@ def __init__(self, input_folder: str) -> None: super().__init__(message, input_folder=input_folder) +class InputFolderNotFound(UserError): + """Error thrown when the a folder is not found.""" + + def __init__(self, folder_that_was_not_found: str) -> None: + message = f'{folder_that_was_not_found} not found.' + super().__init__( + message, + folder_that_was_not_found=folder_that_was_not_found, + ) + + class CannotUnicodeDecodeFile(UserError): """Error thrown when the input folder is missing data.""" From a2c0507795a887b6fb71d3ef975b714523fe2abb Mon Sep 17 00:00:00 2001 From: Saaketh Narayan Date: Sun, 22 Sep 2024 18:23:51 -0700 Subject: [PATCH 11/17] Make ICL tasks not required for eval (#1540) --- llmfoundry/command_utils/eval.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llmfoundry/command_utils/eval.py b/llmfoundry/command_utils/eval.py index e644ad1f0f..70c4319ea8 100644 --- a/llmfoundry/command_utils/eval.py +++ b/llmfoundry/command_utils/eval.py @@ -262,7 +262,7 @@ def evaluate(cfg: DictConfig) -> tuple[list[Trainer], pd.DataFrame]: EvalConfig, EVAL_CONFIG_KEYS, transforms=[allow_toplevel_keys], - icl_tasks_required=True, + icl_tasks_required=False, ) model_configs = eval_config.models @@ -273,7 +273,7 @@ def evaluate(cfg: DictConfig) -> tuple[list[Trainer], pd.DataFrame]: # Mandatory Evaluation Parameters icl_tasks = eval_config.icl_tasks or eval_config.icl_tasks_str if icl_tasks is None: - raise ValueError('icl_tasks must be specified in the config') + icl_tasks = [] # Optional Evaluation Parameters with default values eval_loader_config = eval_config.eval_loader or eval_config.eval_loaders From 85403c086710bc0f62d03fc03c0fcbb2e5ffda1d Mon Sep 17 00:00:00 2001 From: Shashank Rajput <144760128+ShashankMosaicML@users.noreply.github.com> Date: Mon, 23 Sep 2024 10:37:26 -0400 Subject: [PATCH 12/17] Bumping flash attention version to 2.6.3 and adding option for softcap in attention and lm_head logits. 
(#1374) --- llmfoundry/models/layers/attention.py | 24 +++++- llmfoundry/models/mpt/configuration_mpt.py | 14 +++ llmfoundry/models/mpt/modeling_mpt.py | 6 ++ llmfoundry/models/utils/config_defaults.py | 1 + setup.py | 2 +- tests/models/layers/test_flash_attn.py | 99 +++++++++++++++++++++- 6 files changed, 140 insertions(+), 6 deletions(-) diff --git a/llmfoundry/models/layers/attention.py b/llmfoundry/models/layers/attention.py index 625327767e..612d6b9642 100644 --- a/llmfoundry/models/layers/attention.py +++ b/llmfoundry/models/layers/attention.py @@ -112,6 +112,7 @@ def scaled_multihead_dot_product_attention( dropout_p: float = 0.0, training: bool = False, needs_weights: bool = False, + attn_logit_softcapping: Optional[float] = None, sliding_window_size: int = -1, ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor, torch.Tensor]]]: @@ -149,6 +150,11 @@ def scaled_multihead_dot_product_attention( attn_weight = q.matmul(k) * softmax_scale + if attn_logit_softcapping is not None: + attn_weight = attn_logit_softcapping * torch.tanh( + attn_weight / attn_logit_softcapping, + ) + if attn_bias is not None: # clamp to 0 necessary for torch 2.0 compile() _s_q = max(0, attn_bias.size(2) - s_q) @@ -264,6 +270,7 @@ def flash_attn_fn( sliding_window_size: int = -1, alibi_slopes: Optional[torch.Tensor] = None, flash_attn_padding_info: Optional[dict[str, torch.Tensor]] = None, + attn_logit_softcapping: Optional[float] = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor, torch.Tensor]]]: if key_padding_mask is not None: @@ -381,13 +388,17 @@ def flash_attn_fn( return_attn_probs=needs_weights, ) elif is_flash_v2_installed(): - alibi_kwargs = {} + extra_attn_kwargs = {} if check_alibi_support('flash'): - alibi_kwargs = {'alibi_slopes': alibi_slopes} + extra_attn_kwargs['alibi_slopes'] = alibi_slopes elif alibi_slopes is not None: raise ValueError( 'alibi_slopes is only supported for flash-attn>=2.4.2', ) + if is_flash_v2_installed( + v2_version='v2.6.2', + ) and attn_logit_softcapping is not None: + extra_attn_kwargs['softcap'] = attn_logit_softcapping output_unpad = flash_attn_interface.flash_attn_varlen_func( q=query_unpad, k=key_unpad, @@ -401,7 +412,7 @@ def flash_attn_fn( causal=reset_is_causal, return_attn_probs=needs_weights, window_size=(sliding_window_size, sliding_window_size), - **alibi_kwargs, + **extra_attn_kwargs, ) else: raise RuntimeError( @@ -448,6 +459,7 @@ def __init__( bias: bool = True, sliding_window_size: int = -1, reuse_kv_layer_idx: Optional[int] = None, + attn_logit_softcapping: Optional[float] = None, kv_dim: Optional[int] = None, ): super().__init__() @@ -463,6 +475,7 @@ def __init__( self.kv_n_heads = kv_n_heads self.sliding_window_size = sliding_window_size self.reuse_kv_layer_idx = reuse_kv_layer_idx + self.attn_logit_softcapping = attn_logit_softcapping self.kv_dim = kv_dim if kv_dim is not None else self.d_model self.head_dim = d_model // n_heads @@ -625,6 +638,7 @@ def forward( dropout_p=self.attn_dropout_p, training=self.training, needs_weights=needs_weights, + attn_logit_softcapping=self.attn_logit_softcapping, sliding_window_size=self.sliding_window_size, **extra_attn_kwargs, ) @@ -853,6 +867,7 @@ def __init__( bias: bool = True, sliding_window_size: int = -1, reuse_kv_layer_idx: Optional[int] = None, + attn_logit_softcapping: Optional[float] = None, kv_dim: Optional[int] = None, ): super().__init__( @@ -873,6 +888,7 @@ def __init__( bias=bias, sliding_window_size=sliding_window_size, 
reuse_kv_layer_idx=reuse_kv_layer_idx, + attn_logit_softcapping=attn_logit_softcapping, kv_dim=kv_dim, ) @@ -902,6 +918,7 @@ def __init__( bias: bool = True, sliding_window_size: int = -1, reuse_kv_layer_idx: Optional[int] = None, + attn_logit_softcapping: Optional[float] = None, kv_dim: Optional[int] = None, ): super().__init__( @@ -922,6 +939,7 @@ def __init__( bias=bias, sliding_window_size=sliding_window_size, reuse_kv_layer_idx=reuse_kv_layer_idx, + attn_logit_softcapping=attn_logit_softcapping, kv_dim=kv_dim, ) diff --git a/llmfoundry/models/mpt/configuration_mpt.py b/llmfoundry/models/mpt/configuration_mpt.py index 91b431e3b4..dbcabdf5f9 100644 --- a/llmfoundry/models/mpt/configuration_mpt.py +++ b/llmfoundry/models/mpt/configuration_mpt.py @@ -51,6 +51,7 @@ def __init__( tie_word_embeddings: bool = True, use_pad_tok_in_ffn: bool = True, block_overrides: Optional[dict[str, Any]] = None, + final_logit_softcapping: Optional[float] = None, **kwargs: Any, ): """The MPT configuration class. @@ -148,6 +149,7 @@ def __init__( reuse_kv_layer: attn_config: reuse_kv_layer_idx: -6 # Relative index of the layer whose kv cache to reuse + final_logit_softcapping (float | None): Softcapping threshold for final logit. Set to None to disable (default value None). Please see https://arxiv.org/pdf/2403.08295 for more details. kwargs (Any): Other relevant keyword arguments. """ self.d_model = d_model @@ -181,6 +183,7 @@ def __init__( if block_overrides is not None: self._validate_block_overrides(block_overrides) self.block_overrides = block_overrides + self.final_logit_softcapping = final_logit_softcapping if isinstance(fc_type, str): fc_type = {'name': fc_type} @@ -325,6 +328,17 @@ def _validate_config(self) -> None: raise NotImplementedError( 'sliding window attention only implemented for torch attention and flash attention (v2.3.0 or higher).', ) + if self.attn_config['attn_logit_softcapping'] is not None: + if self.attn_config['attn_logit_softcapping'] <= 0: + raise ValueError( + 'Attention attn_logit_softcapping should be positive.', + ) + if self.attn_config[ + 'attn_impl' + ] == 'flash' and not is_flash_v2_installed(v2_version='v2.6.2',): + raise NotImplementedError( + 'Attention attn_logit_softcapping is only implemented with torch attention or flash attention v2.6.2 (or higher).', + ) if self.attn_config['kv_dim'] is not None and self.attn_config[ 'fused_qkv']: raise ValueError( diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py index cfe1172634..9212f5594d 100644 --- a/llmfoundry/models/mpt/modeling_mpt.py +++ b/llmfoundry/models/mpt/modeling_mpt.py @@ -1071,6 +1071,7 @@ def __init__(self, config: MPTConfig): f"{logit_scale=} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.", ) self.logit_scale = logit_scale + self.final_logit_softcapping = config.final_logit_softcapping @property def backbone_model_class(self) -> type[MPTModel]: @@ -1172,6 +1173,11 @@ def forward( ) logits *= self.logit_scale + if self.final_logit_softcapping is not None: + logits = self.final_logit_softcapping * torch.tanh( + logits / self.final_logit_softcapping, + ) + loss = None if labels is not None: _labels = torch.roll(labels, shifts=-1) diff --git a/llmfoundry/models/utils/config_defaults.py b/llmfoundry/models/utils/config_defaults.py index bd3b29a479..5550785149 100644 --- a/llmfoundry/models/utils/config_defaults.py +++ b/llmfoundry/models/utils/config_defaults.py @@ -18,6 +18,7 @@ 'softmax_scale': None, 'attn_uses_sequence_id': False, 
'sliding_window_size': -1, + 'attn_logit_softcapping': None, 'alibi': False, 'alibi_bias_max': 8, 'rope': False, diff --git a/setup.py b/setup.py index 0a75c610b8..ebc66fdacf 100644 --- a/setup.py +++ b/setup.py @@ -104,7 +104,7 @@ # Flash 2 group kept for backwards compatibility extra_deps['gpu-flash2'] = [ - 'flash-attn>=2.5.8,<3', + 'flash-attn>=2.6.3,<3', ] extra_deps['gpu'] = copy.deepcopy(extra_deps['gpu-flash2']) diff --git a/tests/models/layers/test_flash_attn.py b/tests/models/layers/test_flash_attn.py index 987ea7160a..666d93c9b4 100644 --- a/tests/models/layers/test_flash_attn.py +++ b/tests/models/layers/test_flash_attn.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import math +from typing import Optional import pytest import torch @@ -334,5 +335,99 @@ def gen_bias(): _assert_approx_equal(value_1.grad, value_2.grad) -def _assert_approx_equal(value1: torch.Tensor, value2: torch.Tensor): - assert torch.norm(value2 - value1) <= 1e-2 + 1e-2 * torch.norm(value2) +@pytest.mark.gpu +@pytest.mark.skipif( + not is_flash_v2_installed(v2_version='v2.6.2'), + reason= + 'attn_logit_softcapping only supported by Flash Attention after v2.6.2.', +) +@pytest.mark.parametrize( + 'attn_logit_softcapping', + [None, 0.1, 1.0, 10.0, 100.0], +) +def test_attn_logit_softcapping(attn_logit_softcapping: Optional[float]): + # Test that attn_logit_softcapping in attention works as expected. + dtype = torch.bfloat16 + device = 'cuda' + d = 128 + seqlen_1 = 8 + bsz = 2 + n_heads = 4 + + query_1 = torch.randn(bsz, seqlen_1, + n_heads * d).to(dtype=dtype, device=device) + query_1.requires_grad = True + key_1 = torch.randn(bsz, seqlen_1, + n_heads * d).to(dtype=dtype, device=device) + key_1.requires_grad = True + value_1 = torch.randn(bsz, seqlen_1, + n_heads * d).to(dtype=dtype, device=device) + value_1.requires_grad = True + output_1, _, _ = flash_attn_fn( + query=query_1, + key=key_1, + value=value_1, + n_heads=n_heads, + kv_n_heads=n_heads, + past_key_value=None, + softmax_scale=1 / math.sqrt(d), + attn_bias=None, + key_padding_mask=None, + is_causal=True, + dropout_p=0.0, + training=False, + needs_weights=False, + flash_attn_padding_info=gen_flash_attn_padding_info( + bsz, + seqlen_1, + 0, + query_1.device, + None, + None, + ), + should_repeat_kv_for_gqa=True, + attn_logit_softcapping=attn_logit_softcapping, + ) + output_1.sum().backward() + + query_2 = query_1.detach().clone() + query_2.requires_grad = True + key_2 = key_1.detach().clone() + key_2.requires_grad = True + value_2 = value_1.detach().clone() + value_2.requires_grad = True + output_2, _, _ = scaled_multihead_dot_product_attention( + query=query_2, + key=key_2, + value=value_2, + n_heads=n_heads, + kv_n_heads=n_heads, + past_key_value=None, + softmax_scale=1 / math.sqrt(d), + key_padding_mask=None, + is_causal=True, + dropout_p=0.0, + training=False, + needs_weights=False, + attn_logit_softcapping=attn_logit_softcapping, + ) + output_2.sum().backward() + + _assert_approx_equal(output_1, output_2) + assert (query_2.grad is not None) and (query_1.grad is not None) + _assert_approx_equal(query_1.grad, query_2.grad) + assert (key_2.grad is not None) and (key_1.grad is not None) + _assert_approx_equal(key_1.grad, key_2.grad) + assert (value_2.grad is not None) and (value_1.grad is not None) + _assert_approx_equal(value_1.grad, value_2.grad) + + +def _assert_approx_equal( + value1: torch.Tensor, + value2: torch.Tensor, + atol: float = 1e-2, + rtol: float = 1e-2, +): + actual_difference = torch.norm(value2 - value1) + allowed_difference = 
atol + rtol * torch.norm(value2) + assert actual_difference < allowed_difference, f'{actual_difference=}, {allowed_difference=}' From f377090dec102afc646fb29a4510ded6ae74ecf9 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Mon, 23 Sep 2024 16:00:07 -0700 Subject: [PATCH 13/17] Register mosaic logger (#1542) --- llmfoundry/loggers/__init__.py | 2 ++ tests/loggers/test_mosaic_ml_logger.py | 16 ++++++++++++++++ 2 files changed, 18 insertions(+) create mode 100644 tests/loggers/test_mosaic_ml_logger.py diff --git a/llmfoundry/loggers/__init__.py b/llmfoundry/loggers/__init__.py index cd3f3fdc62..c60d9be2cd 100644 --- a/llmfoundry/loggers/__init__.py +++ b/llmfoundry/loggers/__init__.py @@ -4,6 +4,7 @@ from composer.loggers import ( InMemoryLogger, MLFlowLogger, + MosaicMLLogger, TensorboardLogger, WandBLogger, ) @@ -18,3 +19,4 @@ func=InMemoryLogger, ) # for backwards compatibility loggers.register('mlflow', func=MLFlowLogger) +loggers.register('mosaicml', func=MosaicMLLogger) diff --git a/tests/loggers/test_mosaic_ml_logger.py b/tests/loggers/test_mosaic_ml_logger.py new file mode 100644 index 0000000000..e9c003321b --- /dev/null +++ b/tests/loggers/test_mosaic_ml_logger.py @@ -0,0 +1,16 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +from composer.loggers import MosaicMLLogger + +from llmfoundry.utils.builders import build_logger + + +def test_mosaic_ml_logger_constructs(): + mosaic_ml_logger = build_logger( + 'mosaicml', + kwargs={'ignore_exceptions': True}, + ) + + assert isinstance(mosaic_ml_logger, MosaicMLLogger) + assert mosaic_ml_logger.ignore_exceptions == True From d85c83b15d5b07a1b8cd00eaa7e400aaf7b22ea7 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Mon, 23 Sep 2024 23:24:16 -0700 Subject: [PATCH 14/17] Hfcheckpointer optional generation config (#1543) Co-authored-by: v-chen_data --- llmfoundry/callbacks/hf_checkpointer.py | 7 ++- .../inference/test_convert_composer_to_hf.py | 56 ++++++++++++++++++- 2 files changed, 58 insertions(+), 5 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 65bdcb3b6c..4365a5b2e5 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -588,9 +588,10 @@ def tensor_hook( del new_base_model_instance else: new_model_instance = type(original_model)(new_config) - new_model_instance.generation_config.update( - **original_model.generation_config.to_dict(), - ) + if new_model_instance.generation_config is not None: + new_model_instance.generation_config.update( + **original_model.generation_config.to_dict(), + ) # Then load the state dict in with "assign" so that the state dict # is loaded properly even though the model is initially on meta device. 
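# The guard above matters because `generation_config` on a Hugging Face model can be
# a GenerationConfig object, a plain dict attached to the config, or absent entirely
# (None for non-generative wrappers); the checkpointer now only copies it across when
# the freshly constructed model actually has one. The new test below parametrizes
# exactly those cases. A small normalization helper in the same spirit, purely
# illustrative rather than llm-foundry API:
from typing import Any, Optional, Union

from transformers import GenerationConfig


def normalize_generation_config(
    value: Optional[Union[dict[str, Any], GenerationConfig]],
) -> Optional[GenerationConfig]:
    if value is None or isinstance(value, GenerationConfig):
        return value
    # A bare dict (possibly empty) becomes a real GenerationConfig object.
    return GenerationConfig(**value)


# Example: normalize_generation_config({'max_length': 200}).max_length == 200,
# while normalize_generation_config(None) stays None.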
diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py index 66ec739a65..bf5f2a970b 100644 --- a/tests/a_scripts/inference/test_convert_composer_to_hf.py +++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py @@ -8,13 +8,14 @@ import pathlib import shutil from argparse import Namespace -from typing import Any, Callable, Optional, cast +from typing import Any, Callable, Optional, Union, cast from unittest import mock from unittest.mock import ANY, MagicMock, patch import catalogue import pytest import torch +import torch.nn as nn import transformers from composer import ComposerModel, Trainer from composer.loggers import MLFlowLogger @@ -23,7 +24,13 @@ from omegaconf import OmegaConf as om from torch.distributed._tensor.api import DTensor from torch.utils.data import DataLoader -from transformers import PreTrainedModel, PreTrainedTokenizerBase +from transformers import ( + AutoConfig, + GenerationConfig, + PretrainedConfig, + PreTrainedModel, + PreTrainedTokenizerBase, +) from llmfoundry.callbacks import HuggingFaceCheckpointer from llmfoundry.callbacks.hf_checkpointer import _maybe_get_license_filename @@ -1637,3 +1644,48 @@ def test_license_file_finder( found_path = _maybe_get_license_filename(str(tmp_path)) assert (found_path == license_file_name ) if license_file_name is not None else (found_path is None) + + +@pytest.mark.parametrize('generation_config', [None, {}, {'max_length': 200}]) +def test_generation_config_variants( + generation_config: Optional[Union[dict[str, Any], GenerationConfig]], +): + + class MockModel(nn.Module): + + def __init__(self, config: PretrainedConfig): + super().__init__() + self.config = config + # Ensure generation_config is always a GenerationConfig object + if isinstance(config.generation_config, dict): + self.generation_config = GenerationConfig( + **config.generation_config, + ) + else: + self.generation_config = config.generation_config + + config = AutoConfig.from_pretrained('gpt2') + # Convert dict to GenerationConfig if needed + if isinstance(generation_config, dict): + generation_config = GenerationConfig(**generation_config) + config.generation_config = generation_config + + mock_model = MockModel(config) + logger = MagicMock() + state = MagicMock() + state.timestamp.batch = 1 + state.is_model_ddp = False + state.model.model = mock_model + state.model.tokenizer = None + + checkpointer = HuggingFaceCheckpointer( + save_folder='test', + save_interval='1ba', + ) + + checkpointer._save_checkpoint( + state=state, + logger=logger, + upload_to_save_folder=False, + register_to_mlflow=False, + ) From 275a2a40d86a36882cc7963e2677628e05aaaf01 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Tue, 24 Sep 2024 16:57:21 -0700 Subject: [PATCH 15/17] Bump composer version to 0.25.0 (#1546) --- setup.py | 8 ++++---- tests/a_scripts/inference/test_convert_composer_to_hf.py | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index ebc66fdacf..48c1326b0d 100644 --- a/setup.py +++ b/setup.py @@ -52,7 +52,7 @@ ] install_requires = [ - 'mosaicml[libcloud,wandb,oci,gcs,mlflow]>=0.24.1,<0.25', + 'mosaicml[libcloud,wandb,oci,gcs,mlflow]>=0.25.0,<0.26', 'mlflow>=2.14.1,<2.17', 'accelerate>=0.25,<0.34', # for HF inference `device_map` 'transformers>=4.43.2,<4.44', @@ -91,7 +91,7 @@ ] extra_deps['databricks'] = [ - 'mosaicml[databricks]>=0.24.1,<0.25', + 'mosaicml[databricks]>=0.25.0,<0.26', 'numpy<2', 
'databricks-sql-connector>=3,<4', 'databricks-connect==14.1.0', @@ -99,7 +99,7 @@ ] extra_deps['tensorboard'] = [ - 'mosaicml[tensorboard]>=0.24.1,<0.25', + 'mosaicml[tensorboard]>=0.25.0,<0.26', ] # Flash 2 group kept for backwards compatibility @@ -110,7 +110,7 @@ extra_deps['gpu'] = copy.deepcopy(extra_deps['gpu-flash2']) extra_deps['peft'] = [ - 'mosaicml[peft]>=0.24.1,<0.25', + 'mosaicml[peft]>=0.25.0,<0.26', ] extra_deps['openai'] = [ diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py index bf5f2a970b..c25432dc48 100644 --- a/tests/a_scripts/inference/test_convert_composer_to_hf.py +++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py @@ -1563,6 +1563,8 @@ def test_mptmoe_huggingface_conversion_callback( # Check output equivalence loaded_model = loaded_model.cuda().bfloat16() # type: ignore + for k, v in batch.items(): + batch[k] = v.cuda() loaded_model_logits = loaded_model( input_ids=batch.get('input_ids', None), attention_mask=batch.get('attention_mask', None), From 151a2e297b603d84e1e4dfed389c3494990936e6 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Wed, 25 Sep 2024 08:53:05 -0700 Subject: [PATCH 16/17] Bump streaming version to 0.9.0 (#1550) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 48c1326b0d..d1979faf63 100644 --- a/setup.py +++ b/setup.py @@ -56,7 +56,7 @@ 'mlflow>=2.14.1,<2.17', 'accelerate>=0.25,<0.34', # for HF inference `device_map` 'transformers>=4.43.2,<4.44', - 'mosaicml-streaming>=0.8.1,<0.9', + 'mosaicml-streaming>=0.9.0,<0.10', 'torch>=2.4.0,<2.4.1', 'datasets>=2.19,<2.20', 'fsspec==2023.6.0', # newer version results in a bug in datasets that duplicates data From 722526d420dab9adc5a5be18425d5e08c97ee0c8 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Wed, 25 Sep 2024 09:25:27 -0700 Subject: [PATCH 17/17] Bump version to 0.13.0.dev0 (#1549) --- llmfoundry/_version.py | 2 +- llmfoundry/command_utils/eval.py | 2 +- llmfoundry/models/hf/model_wrapper.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llmfoundry/_version.py b/llmfoundry/_version.py index 2f1f590b19..0cddcaf967 100644 --- a/llmfoundry/_version.py +++ b/llmfoundry/_version.py @@ -3,4 +3,4 @@ """The LLM Foundry Version.""" -__version__ = '0.12.0.dev0' +__version__ = '0.13.0.dev0' diff --git a/llmfoundry/command_utils/eval.py b/llmfoundry/command_utils/eval.py index 70c4319ea8..73127e8a07 100644 --- a/llmfoundry/command_utils/eval.py +++ b/llmfoundry/command_utils/eval.py @@ -82,7 +82,7 @@ def evaluate_model( warnings.warn( VersionedDeprecationWarning( 'The argument fsdp_config is deprecated. Please use parallelism_config instead.', - remove_version='0.13.0', + remove_version='0.14.0', ), ) if fsdp_config and parallelism_config: diff --git a/llmfoundry/models/hf/model_wrapper.py b/llmfoundry/models/hf/model_wrapper.py index c8805e5d6d..f2b67db1ec 100644 --- a/llmfoundry/models/hf/model_wrapper.py +++ b/llmfoundry/models/hf/model_wrapper.py @@ -48,7 +48,7 @@ def __init__( warnings.warn( VersionedDeprecationWarning( '`HuggingFaceModelWithFSDP` is deprecated. In the future please use `BaseHuggingFaceModel`.', - remove_version='0.13.0', + remove_version='0.14.0', ), ) super().__init__(
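# Bumping `__version__` to 0.13.0.dev0 goes hand in hand with the two hunks above:
# every pending VersionedDeprecationWarning has its `remove_version` pushed from
# 0.13.0 to 0.14.0, so nothing is slated for removal in the release that is just
# being opened. A hedged sketch of that warning pattern; the wrapped function is
# illustrative, and the import path is assumed to be llmfoundry.utils.warnings:
import warnings

from llmfoundry.utils.warnings import VersionedDeprecationWarning


def deprecated_entry_point() -> None:
    # Emit the versioned warning on use; callers should migrate before 0.14.0.
    warnings.warn(
        VersionedDeprecationWarning(
            'deprecated_entry_point is deprecated. Please use its replacement instead.',
            remove_version='0.14.0',
        ),
    )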