From e72bbaf7d5fe36a55a0e5787e95cd215d3e96306 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Fri, 7 Jun 2024 20:12:51 -0400 Subject: [PATCH 01/69] bump (#3383) Co-authored-by: v-chen_data --- .github/workflows/code-quality.yaml | 2 +- .github/workflows/codeql-analysis.yml | 2 +- .github/workflows/coverage.yaml | 2 +- .github/workflows/daily.yaml | 4 ++-- .github/workflows/pr-cpu.yaml | 2 +- .github/workflows/pr-gpu.yaml | 6 +++--- .github/workflows/release.yaml | 2 +- .github/workflows/smoketest.yaml | 2 +- 8 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/code-quality.yaml b/.github/workflows/code-quality.yaml index 338fa77a17..c35546f4ca 100644 --- a/.github/workflows/code-quality.yaml +++ b/.github/workflows/code-quality.yaml @@ -34,7 +34,7 @@ jobs: uses: actions/checkout@v3 with: repository: mosaicml/ci-testing - ref: v0.0.7 + ref: v0.0.8 path: ./ci-testing - uses: ./ci-testing/.github/actions/code-quality with: diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index a8a510bffb..0cb835fbde 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -45,7 +45,7 @@ jobs: uses: actions/checkout@v3 with: repository: mosaicml/ci-testing - ref: v0.0.7 + ref: v0.0.8 path: ./ci-testing - uses: ./ci-testing/.github/actions/codeql-analysis with: diff --git a/.github/workflows/coverage.yaml b/.github/workflows/coverage.yaml index 1bdae1efb8..9432e8c6c9 100644 --- a/.github/workflows/coverage.yaml +++ b/.github/workflows/coverage.yaml @@ -16,7 +16,7 @@ jobs: uses: actions/checkout@v3 with: repository: mosaicml/ci-testing - ref: v0.0.7 + ref: v0.0.8 path: ./ci-testing - uses: ./ci-testing/.github/actions/coverage with: diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml index 320c1a5fe6..6b67e857ec 100644 --- a/.github/workflows/daily.yaml +++ b/.github/workflows/daily.yaml @@ -14,7 +14,7 @@ concurrency: cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }} jobs: daily-pytest-cpu: - uses: mosaicml/ci-testing/.github/workflows/pytest-cpu.yaml@v0.0.7 + uses: mosaicml/ci-testing/.github/workflows/pytest-cpu.yaml@v0.0.8 strategy: matrix: include: @@ -100,7 +100,7 @@ jobs: download-path: artifacts daily-pytest-gpu: - uses: mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.7 + uses: mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.8 strategy: matrix: # Unlike CPU tests, we run daily tests together with GPU tests to minimize launch time diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index f32a589160..1bdb383823 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -9,7 +9,7 @@ concurrency: cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }} jobs: pytest-cpu: - uses: mosaicml/ci-testing/.github/workflows/pytest-cpu.yaml@v0.0.7 + uses: mosaicml/ci-testing/.github/workflows/pytest-cpu.yaml@v0.0.8 strategy: matrix: include: diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml index 3cb434ca58..f056292a43 100644 --- a/.github/workflows/pr-gpu.yaml +++ b/.github/workflows/pr-gpu.yaml @@ -9,7 +9,7 @@ concurrency: cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }} jobs: pytest-gpu-1: - uses: mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.7 + uses: mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.8 strategy: matrix: include: @@ -35,7 +35,7 @@ 
jobs: mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }} pytest-gpu-2: - uses: mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.7 + uses: mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.8 strategy: matrix: include: @@ -62,7 +62,7 @@ jobs: pytest-gpu-4: - uses: mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.7 + uses: mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.8 strategy: matrix: include: diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index faabebc7ac..0b253ea87f 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -24,7 +24,7 @@ jobs: uses: actions/checkout@v3 with: repository: mosaicml/ci-testing - ref: v0.0.7 + ref: v0.0.8 path: ./ci-testing - uses: ./ci-testing/.github/actions/code-quality with: diff --git a/.github/workflows/smoketest.yaml b/.github/workflows/smoketest.yaml index 08291b5c0e..e9c6316a8d 100644 --- a/.github/workflows/smoketest.yaml +++ b/.github/workflows/smoketest.yaml @@ -33,7 +33,7 @@ jobs: uses: actions/checkout@v3 with: repository: mosaicml/ci-testing - ref: v0.0.7 + ref: v0.0.8 path: ./ci-testing - uses: ./ci-testing/.github/actions/smoketest with: From 9c4b0ba2f899ed779017b7ed2d856348ceb43eb3 Mon Sep 17 00:00:00 2001 From: bigning Date: Fri, 7 Jun 2024 17:59:19 -0700 Subject: [PATCH 02/69] Fix backward compatibility caused by missing eval metrics class (#3385) * a * a' * a * a * a * a * a * a * a * Apply suggestions from code review Co-authored-by: Mihir Patel --------- Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> Co-authored-by: Mihir Patel --- composer/metrics/nlp.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/composer/metrics/nlp.py b/composer/metrics/nlp.py index 2f60f8d1c9..e6877292cf 100644 --- a/composer/metrics/nlp.py +++ b/composer/metrics/nlp.py @@ -178,3 +178,27 @@ def compute(self) -> Tensor: """Returns torch.exp() of the LanguageCrossEntropy.""" avg_loss = super().compute() return torch.exp(avg_loss) + + +# For backward compatibility +class InContextLearningMetric: + """InContextLearningMetric only exists for backwards compatibility of checkpoints that contain pickled metrics.""" + + def __init__(self): + raise RuntimeError( + f'This class only exists for maintaining backward compatibility for checkpoints that contain pickled metrics. 
Please instead use https://github.com/mosaicml/llm-foundry/blob/main/scripts/eval/README.md.', + ) + + def __getstate__(self): + return None + + def __setstate__(self, state): + pass + + +InContextLearningCodeEvalAccuracy = InContextLearningMetric +InContextLearningLMAccuracy = InContextLearningMetric +InContextLearningLMExpectedCalibrationError = InContextLearningMetric +InContextLearningMCExpectedCalibrationError = InContextLearningMetric +InContextLearningQAAccuracy = InContextLearningMetric +InContextLearningMultipleChoiceAccuracy = InContextLearningMetric From e85e7385544b6ef6de13beb50b76d21f731fa6f1 Mon Sep 17 00:00:00 2001 From: bigning Date: Fri, 7 Jun 2024 18:24:35 -0700 Subject: [PATCH 03/69] Bump version v0.23.2 (#3386) * a * bump --- composer/_version.py | 2 +- docker/README.md | 4 ++-- docker/build_matrix.yaml | 12 ++++++------ docker/generate_build_matrix.py | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/composer/_version.py b/composer/_version.py index a38b61a722..50d801763e 100644 --- a/composer/_version.py +++ b/composer/_version.py @@ -3,4 +3,4 @@ """The Composer Version.""" -__version__ = '0.24.0.dev0' +__version__ = '0.23.2' diff --git a/docker/README.md b/docker/README.md index 76128b6e92..05c97fe626 100644 --- a/docker/README.md +++ b/docker/README.md @@ -15,8 +15,8 @@ all dependencies for both NLP and Vision models. They are built on top of the | Composer Version | CUDA Support | Docker Tag | |--------------------|----------------|----------------------------------------------------------------| -| 0.23.1 | Yes | `mosaicml/composer:latest`, `mosaicml/composer:0.23.1` | -| 0.23.1 | No | `mosaicml/composer:latest_cpu`, `mosaicml/composer:0.23.1_cpu` | +| 0.23.2 | Yes | `mosaicml/composer:latest`, `mosaicml/composer:0.23.2` | +| 0.23.2 | No | `mosaicml/composer:latest_cpu`, `mosaicml/composer:0.23.2_cpu` | **Note**: For a lightweight installation, we recommended using a [MosaicML PyTorch Image](#pytorch-images) and manually diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml index 332d7deb5e..73074988b9 100644 --- a/docker/build_matrix.yaml +++ b/docker/build_matrix.yaml @@ -208,9 +208,9 @@ TORCHVISION_VERSION: 0.16.2 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04 - COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.1 + COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.2 CUDA_VERSION: 12.1.1 - IMAGE_NAME: composer-0-23-1 + IMAGE_NAME: composer-0-23-2 MOFED_VERSION: latest-23.10 NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 @@ -231,15 +231,15 @@ PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.3.1 TAGS: - - mosaicml/composer:0.23.1 + - mosaicml/composer:0.23.2 - mosaicml/composer:latest TARGET: composer_stage TORCHVISION_VERSION: 0.18.1 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: ubuntu:20.04 - COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.1 + COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.2 CUDA_VERSION: '' - IMAGE_NAME: composer-0-23-1-cpu + IMAGE_NAME: composer-0-23-2-cpu MOFED_VERSION: latest-23.10 NVIDIA_REQUIRE_CUDA_OVERRIDE: '' PYTHON_VERSION: '3.11' @@ -247,7 +247,7 @@ PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.3.1 TAGS: - - mosaicml/composer:0.23.1_cpu + - mosaicml/composer:0.23.2_cpu - mosaicml/composer:latest_cpu TARGET: composer_stage TORCHVISION_VERSION: 0.18.1 diff --git a/docker/generate_build_matrix.py 
b/docker/generate_build_matrix.py index f0398ed750..bf961a756c 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -231,7 +231,7 @@ def _main(): composer_entries = [] # The `GIT_COMMIT` is a placeholder and Jenkins will substitute it with the actual git commit for the `composer_staging` images - composer_versions = ['0.23.1'] # Only build images for the latest composer version + composer_versions = ['0.23.2'] # Only build images for the latest composer version composer_python_versions = [PRODUCTION_PYTHON_VERSION] # just build composer against the latest for product in itertools.product(composer_python_versions, composer_versions, cuda_options): From afa2e397b4a073fa1c99e3b5841013b9f8f74cb1 Mon Sep 17 00:00:00 2001 From: bigning Date: Fri, 7 Jun 2024 19:46:59 -0700 Subject: [PATCH 04/69] Restore dev version (#3388) * a * a --- composer/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/_version.py b/composer/_version.py index 50d801763e..a38b61a722 100644 --- a/composer/_version.py +++ b/composer/_version.py @@ -3,4 +3,4 @@ """The Composer Version.""" -__version__ = '0.23.2' +__version__ = '0.24.0.dev0' From 4cbb4a21aec7f8d4ed0b12417dadc7f335e383c7 Mon Sep 17 00:00:00 2001 From: Antoine Broyelle Date: Sun, 9 Jun 2024 20:36:53 +0100 Subject: [PATCH 05/69] Only requires `databricks-sdk` when inside the Databricks platform (#3389) --- composer/loggers/mlflow_logger.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/composer/loggers/mlflow_logger.py b/composer/loggers/mlflow_logger.py index f24c5f956f..92b3fc2657 100644 --- a/composer/loggers/mlflow_logger.py +++ b/composer/loggers/mlflow_logger.py @@ -88,7 +88,6 @@ def __init__( ) -> None: try: import mlflow - from databricks.sdk import WorkspaceClient from mlflow import MlflowClient except ImportError as e: raise MissingConditionalImportError( @@ -143,9 +142,19 @@ def __init__( DEFAULT_MLFLOW_EXPERIMENT_NAME, ) assert self.experiment_name is not None # type hint + if os.getenv('DATABRICKS_TOKEN') is not None and not self.experiment_name.startswith('/Users/'): + try: + from databricks.sdk import WorkspaceClient + except ImportError as e: + raise MissingConditionalImportError( + extra_deps_group='mlflow', + conda_package='databricks-sdk', + conda_channel='conda-forge', + ) from e databricks_username = WorkspaceClient().current_user.me().user_name or '' self.experiment_name = '/' + os.path.join('Users', databricks_username, self.experiment_name) + self._mlflow_client = MlflowClient(self.tracking_uri) # Set experiment env_exp_id = os.getenv( From 735aa6fa72a0d3799f74d6329e5d38167d35a54f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 9 Jun 2024 18:56:37 -0700 Subject: [PATCH 06/69] Update packaging requirement from <24.1,>=21.3.0 to >=21.3.0,<24.2 (#3392) Updates the requirements on [packaging](https://github.com/pypa/packaging) to permit the latest version. - [Release notes](https://github.com/pypa/packaging/releases) - [Changelog](https://github.com/pypa/packaging/blob/main/CHANGELOG.rst) - [Commits](https://github.com/pypa/packaging/compare/21.3...24.1) --- updated-dependencies: - dependency-name: packaging dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0b40fe0c72..38aa4799b8 100644 --- a/setup.py +++ b/setup.py @@ -88,7 +88,7 @@ def package_files(prefix: str, directory: str, extension: str): 'coolname>=1.1.0,<3', 'tabulate==0.9.0', # for auto-generating tables 'py-cpuinfo>=8.0.0,<10', - 'packaging>=21.3.0,<24.1', + 'packaging>=21.3.0,<24.2', 'importlib-metadata>=5.0.0,<7', 'mosaicml-cli>=0.5.25,<0.7', ] From db1325a60f7839dd0e86f029e1ac49cc9a8a5dc4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 9 Jun 2024 19:27:14 -0700 Subject: [PATCH 07/69] Bump cryptography from 42.0.6 to 42.0.8 (#3391) Bumps [cryptography](https://github.com/pyca/cryptography) from 42.0.6 to 42.0.8. - [Changelog](https://github.com/pyca/cryptography/blob/main/CHANGELOG.rst) - [Commits](https://github.com/pyca/cryptography/compare/42.0.6...42.0.8) --- updated-dependencies: - dependency-name: cryptography dependency-type: direct:development update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 38aa4799b8..2eba4e39e6 100644 --- a/setup.py +++ b/setup.py @@ -139,7 +139,7 @@ def package_files(prefix: str, directory: str, extension: str): 'GitPython==3.1.43', 'moto[s3]>=4.0.1,<5', 'mock-ssh-server==0.9.1', - 'cryptography==42.0.6', + 'cryptography==42.0.8', 'pytest-httpserver>=1.0.4,<1.1', 'setuptools<=59.5.0', 'pillow==9.3.0', # Matches the Pillow version listed in the Dockerfile From 7778fcf0f80666cd22c789470cf3365ee8e8c041 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 11 Jun 2024 14:25:56 -0400 Subject: [PATCH 08/69] Skip extra dataset state load (#3393) * fix edge case * fix --- composer/core/state.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/composer/core/state.py b/composer/core/state.py index 083b977811..0864b50aaf 100644 --- a/composer/core/state.py +++ b/composer/core/state.py @@ -707,8 +707,10 @@ def train_dataloader(self, train_dataloader: Optional[Union[Iterable, DataLoader train_dataloader (Iterable | DataLoader, optional): The dataloader. """ self._train_dataloader = train_dataloader - # Load dataset state from checkpoint when train_dataloader is set - if self.dataset_state: + # Load dataset state from checkpoint when train_dataloader is set. This occurs if + # dataset_state was loaded from checkpoint and train_dataloader has not already + # consumed dataset_state['train'] to resume. + if self.dataset_state is not None and self.dataset_state.get('train') is not None: dataset = self._dataset_of(self._train_dataloader) if hasattr(dataset, 'load_state_dict'): dataset.load_state_dict(self.dataset_state['train']) # pyright: ignore @@ -1278,14 +1280,14 @@ def _load_dataset_state(self, obj: dict[str, Any]) -> None: Args: obj (dict[str, Any]): The state to load. 
""" - self.dataset_state = obj - dataset = self._dataset_of(self.train_dataloader) if hasattr(dataset, 'load_state_dict'): dataset.load_state_dict(obj['train']) # pyright: ignore obj['train'] = None self.dataset_resumption['train'] = True + self.dataset_state = obj + def load_model_state( self, state_dict: dict[str, Any], From 919fe91557ee17c77cf09e9d405a1c4396ab869d Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Wed, 12 Jun 2024 11:16:26 -0400 Subject: [PATCH 09/69] Remove FSDP restriction from PyTorch 1.13 (#3395) * remove torch 113 * lint --- composer/distributed/dist_strategy.py | 26 +------------------------- 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/composer/distributed/dist_strategy.py b/composer/distributed/dist_strategy.py index 1cc1044a02..be81652881 100644 --- a/composer/distributed/dist_strategy.py +++ b/composer/distributed/dist_strategy.py @@ -328,36 +328,12 @@ def sync_hook(*args): mixed_precision = fsdp_config.mixed_precision keep_low_precision_grads = fsdp_config.keep_low_precision_grads - mixed_precision, param_dtype, _, _ = get_mixed_precision( + mixed_precision, _, _, _ = get_mixed_precision( precision, mixed_precision=mixed_precision, keep_low_precision_grads=keep_low_precision_grads, ) - # Note: FSDP does support the use of torch.float32 with sharding. - # They just never expected a user to pass in torch.float32 into mixed_precision as a param_dtype. - # See: https://github.com/pytorch/pytorch/issues/90584 - # The PR fixing this bug is merged into PyTorch, but it hasn't made its way into a release yet. - # Instead a user needs to pass in `None` as param_dtype to have the parameters as torch.float32. - # TODO: remove these checks when PyTorch has a release that includes the fix. - if sharding_map_key != 'NO_SHARD': - if ( - precision == Precision.AMP_FP16 and param_dtype not in [torch.float16, None] or - precision == Precision.AMP_BF16 and param_dtype not in [torch.bfloat16, None] - ): - raise ValueError( - f'FSDP in PyTorch 1.13 does not support precision `{precision}` with sharding strategy `{sharding_strategy}` ' - f'and param_dtype `{param_dtype}.` Consider using one of the predefined mixed_precision strategies ' - "(choose: `'FULL'`, `'DEFAULT'`, `'PURE'`)", - ) - - if param_dtype == torch.float32: - raise ValueError( - f'FSDP in PyTorch 1.13 does not support param_dtype `{param_dtype}` with sharding_strategy `{sharding_map_key}` ' - f'Consider using `amp` or `bf16` for precision or setting param_dtype in mixed_precision to `None` ' - f'with sharding strategy `{sharding_map_key}.`', - ) - process_group = None if fsdp_config.process_group is not None: process_group_dict = {'process_group': fsdp_config.process_group} From b07b82e5f815787fcb55d8643236ba456439090d Mon Sep 17 00:00:00 2001 From: Joe Early Date: Thu, 13 Jun 2024 18:29:33 +0100 Subject: [PATCH 10/69] Check for 'CUDA error: out of memory' with auto-microbatching (#3400) --- composer/trainer/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index c680d1d3d7..ba455cd78d 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -307,7 +307,7 @@ def _get_initial_device_train_microbatch_size( def _is_cuda_oom(e: RuntimeError): """Determines if error is CUDA Out of Memory and if auto_microbatching is enabled.""" - if 'CUDA out of memory' in str(e): + if any(s in str(e) for s in ['CUDA out of memory', 'CUDA error: out of memory']): return True # With batch_norm, large batch sizes 
sometimes result in cuDNN instead of Cuda OOMs. if 'cuDNN error: CUDNN_STATUS_NOT_SUPPORTED. This error may appear if you passed in a non-contiguous input.' in str( From 6298d76f216b533d60c2fd11db9a0f93851aebfc Mon Sep 17 00:00:00 2001 From: Brian <23239305+b-chu@users.noreply.github.com> Date: Thu, 13 Jun 2024 16:24:14 -0400 Subject: [PATCH 11/69] Add tokens to iterations (#3374) --- composer/core/callback.py | 28 +++++++++++++------------ composer/core/state.py | 2 +- composer/core/time.py | 32 +++++++++++++++++++++++++++++ composer/trainer/trainer.py | 32 ++++++++++++++++++++++------- tests/checkpoint/test_state_dict.py | 1 + tests/test_time.py | 10 +++++++-- 6 files changed, 82 insertions(+), 23 deletions(-) diff --git a/composer/core/callback.py b/composer/core/callback.py index fef48ca1b1..897cf5f733 100644 --- a/composer/core/callback.py +++ b/composer/core/callback.py @@ -273,19 +273,21 @@ def batch_end(self, state: State, logger: Logger) -> None: The following :attr:`.State.timestamp` member variables are incremented immediately before the :attr:`.Event.BATCH_END` event. - +------------------------------------+ - | :attr:`.Timestamp.batch` | - +------------------------------------+ - | :attr:`.Timestamp.batch_in_epoch` | - +------------------------------------+ - | :attr:`.Timestamp.sample` | - +------------------------------------+ - | :attr:`.Timestamp.sample_in_epoch` | - +------------------------------------+ - | :attr:`.Timestamp.token` | - +------------------------------------+ - | :attr:`.Timestamp.token_in_epoch` | - +------------------------------------+ + +--------------------------------------+ + | :attr:`.Timestamp.batch` | + +--------------------------------------+ + | :attr:`.Timestamp.batch_in_epoch` | + +--------------------------------------+ + | :attr:`.Timestamp.sample` | + +--------------------------------------+ + | :attr:`.Timestamp.sample_in_epoch` | + +--------------------------------------+ + | :attr:`.Timestamp.token` | + +--------------------------------------+ + | :attr:`.Timestamp.token_in_epoch` | + +--------------------------------------+ + | :attr:`.Timestamp.token_in_iteration`| + +--------------------------------------+ Args: state (State): The training state. diff --git a/composer/core/state.py b/composer/core/state.py index 0864b50aaf..fa4feaec75 100644 --- a/composer/core/state.py +++ b/composer/core/state.py @@ -766,7 +766,7 @@ def _iteration_length(self, iteration_length: Optional[Union[str, Time[int]]]): return if isinstance(iteration_length, str): iteration_length = ensure_time(iteration_length, TimeUnit.EPOCH) - if iteration_length.unit != TimeUnit.EPOCH: + if iteration_length.unit != TimeUnit.EPOCH and iteration_length.unit != TimeUnit.TOKEN: raise NotImplementedError(f'{iteration_length.unit} is not allowed as a unit for iteration_length.') self.__iteration_length = iteration_length diff --git a/composer/core/time.py b/composer/core/time.py index c21f377026..3916dd7659 100644 --- a/composer/core/time.py +++ b/composer/core/time.py @@ -473,6 +473,7 @@ class Timestamp(Serializable): sample (int | Time[int], optional): The sample. token (int | Time[int], optional): The token. epoch_in_iteration (int | Time[int], optional): The epoch in the iteration. + token_in_iteration (int | Time[int], optional): The token in the iteration. batch_in_epoch (int | Time[int], optional): The batch in the epoch. sample_in_epoch (int | Time[int], optional): The sample in the epoch. token_in_epoch (int | Time[int], optional): The token in the epoch. 
@@ -490,6 +491,7 @@ def __init__( sample: Union[int, Time[int]] = 0, token: Union[int, Time[int]] = 0, epoch_in_iteration: Union[int, Time[int]] = 0, + token_in_iteration: Union[int, Time[int]] = 0, batch_in_epoch: Union[int, Time[int]] = 0, sample_in_epoch: Union[int, Time[int]] = 0, token_in_epoch: Union[int, Time[int]] = 0, @@ -531,6 +533,14 @@ def __init__( )) self._epoch_in_iteration = epoch_in_iteration + token_in_iteration = Time.from_input(token_in_iteration, TimeUnit.TOKEN) + if token_in_iteration.unit != TimeUnit.TOKEN: + raise ValueError(( + f'The `token_in_iteration` argument has units of {token_in_iteration.unit}; ' + f'not {TimeUnit.TOKEN}.' + )) + self._token_in_iteration = token_in_iteration + batch_in_epoch = Time.from_input(batch_in_epoch, TimeUnit.BATCH) if batch_in_epoch.unit != TimeUnit.BATCH: raise ValueError( @@ -579,6 +589,7 @@ def state_dict(self) -> dict[str, Any]: 'sample': self.sample.value, 'token': self.token.value, 'epoch_in_iteration': self.epoch_in_iteration.value, + 'token_in_iteration': self.token_in_iteration.value, 'batch_in_epoch': self.batch_in_epoch.value, 'sample_in_epoch': self.sample_in_epoch.value, 'token_in_epoch': self.token_in_epoch.value, @@ -609,6 +620,8 @@ def load_state_dict(self, state: dict[str, Any]) -> None: self._iteration = Time(state['iteration'], TimeUnit.ITERATION) if 'epoch_in_iteration' in state: self._epoch_in_iteration = Time(state['epoch_in_iteration'], TimeUnit.EPOCH) + if 'token_in_iteration' in state: + self._token_in_iteration = Time(state['token_in_iteration'], TimeUnit.TOKEN) if 'iteration_wct' in state: self._iteration_wct = state['iteration_wct'] @@ -642,6 +655,11 @@ def epoch_in_iteration(self) -> Time[int]: """The epoch count in the current iteration (resets at 0 at the beginning of every iteration).""" return self._epoch_in_iteration + @property + def token_in_iteration(self) -> Time[int]: + """The token count in the current iteration (resets at 0 at the beginning of every iteration).""" + return self._token_in_iteration + @property def batch_in_epoch(self) -> Time[int]: """The batch count in the current epoch (resets at 0 at the beginning of every epoch).""" @@ -814,6 +832,7 @@ def to_next_batch( sample_in_epoch=self.sample_in_epoch + samples, token=self.token + tokens, token_in_epoch=self.token_in_epoch + tokens, + token_in_iteration=self.token_in_iteration + tokens, total_wct=self.total_wct + duration, iteration_wct=self.iteration_wct + duration, epoch_wct=self.epoch_wct + duration, @@ -822,6 +841,7 @@ def to_next_batch( def to_next_epoch( self, + tokens: Union[int, Time] = 0, duration: Optional[datetime.timedelta] = None, ): """Create a new :class:`.Timestamp`, advanced to the next epoch. @@ -841,6 +861,7 @@ def to_next_epoch( >>> timestamp.copy( ... epoch=timestamp.epoch + 1, ... epoch_in_iteration=timestamp.epoch_in_iteration + 1, + ... token_in_iteration=timestamp.token_in_iteration + tokens, ... batch_in_epoch=0, ... sample_in_epoch=0, ... token_in_epoch=0, @@ -851,12 +872,17 @@ def to_next_epoch( ... ) Timestamp(...) + Args: + tokens (int | Time, optional): The number of tokens trained in the batch. Defaults to 0. + duration (datetime.timedelta, optional): The duration to train the batch. 
+ """ if duration is None: duration = datetime.timedelta(seconds=0) return self.copy( epoch=self.epoch + 1, epoch_in_iteration=self.epoch_in_iteration + 1, + token_in_iteration=self.token_in_iteration + tokens, batch_in_epoch=0, sample_in_epoch=0, token_in_epoch=0, @@ -886,6 +912,7 @@ def to_next_iteration( >>> timestamp.copy( ... iteration=timestamp.iteration + 1, ... epoch_in_iteration=0, + ... token_in_iteration=0, ... batch_in_epoch=0, ... sample_in_epoch=0, ... token_in_epoch=0, @@ -902,6 +929,7 @@ def to_next_iteration( return self.copy( iteration=self.iteration + 1, epoch_in_iteration=0, + token_in_iteration=0, batch_in_epoch=0, sample_in_epoch=0, token_in_epoch=0, @@ -919,6 +947,7 @@ def copy( sample: Optional[Union[int, Time[int]]] = None, token: Optional[Union[int, Time[int]]] = None, epoch_in_iteration: Optional[Union[int, Time[int]]] = None, + token_in_iteration: Optional[Union[int, Time[int]]] = None, batch_in_epoch: Optional[Union[int, Time[int]]] = None, sample_in_epoch: Optional[Union[int, Time[int]]] = None, token_in_epoch: Optional[Union[int, Time[int]]] = None, @@ -938,6 +967,7 @@ def copy( sample (int | Time[int], optional): The sample. token (int | Time[int], optional): The token. epoch_in_iteration (int | Time[int], optional): The epoch in the iteration. + token_in_iteration (int | Time[int], optional): The token in the iteration. batch_in_epoch (int | Time[int], optional): The batch in the epoch. sample_in_epoch (int | Time[int], optional): The sample in the epoch. token_in_epoch (int | Time[int], optional): The token in the epoch. @@ -957,6 +987,7 @@ def copy( sample=sample if sample is not None else self.sample, token=token if token is not None else self.token, epoch_in_iteration=epoch_in_iteration if epoch_in_iteration is not None else self.epoch_in_iteration, + token_in_iteration=token_in_iteration if token_in_iteration is not None else self.token_in_iteration, batch_in_epoch=batch_in_epoch if batch_in_epoch is not None else self.batch_in_epoch, sample_in_epoch=sample_in_epoch if sample_in_epoch is not None else self.sample_in_epoch, token_in_epoch=token_in_epoch if token_in_epoch is not None else self.token_in_epoch, @@ -975,6 +1006,7 @@ def __repr__(self) -> str: f'sample={int(self.sample)}, ' f'token={int(self.token)}, ' f'epoch_in_iteration={int(self.epoch_in_iteration)}, ' + f'token_in_iteration={int(self.token_in_iteration)}, ' f'batch_in_epoch={int(self.batch_in_epoch)}, ' f'sample_in_epoch={int(self.sample_in_epoch)}, ' f'token_in_epoch={int(self.token_in_epoch)}, ' diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index ba455cd78d..4447698beb 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -2610,10 +2610,24 @@ def _train_loop(self) -> None: self.engine.run_event(Event.BATCH_CHECKPOINT) - if self.state.timestamp >= self.state.max_duration: + if ( + self.state.timestamp >= self.state.max_duration or ( + self.state._iteration_length is not None and + self.state.timestamp.token_in_iteration.unit == self.state._iteration_length.unit and + self.state.timestamp.token_in_iteration >= self.state._iteration_length + ) + ): # If max_duration is specified in batches, samples, or tokens, and # and the max_duration is reached mid-epoch, then break out of the dataloader # to finish the epoch early and finish training. 
+ + # Increment iteration + if ( + self.state._iteration_length is not None and + self.state.timestamp.token_in_iteration.unit == self.state._iteration_length.unit and + self.state.timestamp.token_in_iteration >= self.state._iteration_length + ): + self._increment_iteration() finished_epoch_early = True break @@ -2649,12 +2663,10 @@ def _train_loop(self) -> None: # Increment iteration if ( self.state._iteration_length is not None and - self.state.timestamp.epoch_in_iteration == self.state._iteration_length + self.state.timestamp.epoch_in_iteration.unit == self.state._iteration_length.unit and + self.state.timestamp.epoch_in_iteration >= self.state._iteration_length ): - self.state.previous_timestamp = self.state.timestamp - self.state.timestamp = self.state.timestamp.to_next_iteration() - self.engine.run_event(Event.ITERATION_END) - self.engine.run_event(Event.ITERATION_CHECKPOINT) + self._increment_iteration() # Log final time values self.logger.log_metrics({ @@ -3039,6 +3051,12 @@ def _train_microbatch( return microbatch_loss_dict + def _increment_iteration(self): + self.state.previous_timestamp = self.state.timestamp + self.state.timestamp = self.state.timestamp.to_next_iteration() + self.engine.run_event(Event.ITERATION_END) + self.engine.run_event(Event.ITERATION_CHECKPOINT) + def predict( self, dataloader: Union[DataLoader, DataSpec], @@ -3506,7 +3524,7 @@ def _eval_loop( outputs.append(v) else: outputs = self.state.outputs.cpu() - batch = DeviceCPU().batch_to_device(self.state.batch,) + batch = DeviceCPU().batch_to_device(self.state.batch) else: outputs = self.state.outputs batch = self.state.batch diff --git a/tests/checkpoint/test_state_dict.py b/tests/checkpoint/test_state_dict.py index ee53a36ff9..af0ca34961 100644 --- a/tests/checkpoint/test_state_dict.py +++ b/tests/checkpoint/test_state_dict.py @@ -568,6 +568,7 @@ def test_get_resumption_state_dict(): 'sample': 0, 'token': 0, 'epoch_in_iteration': 0, + 'token_in_iteration': 0, 'batch_in_epoch': 0, 'sample_in_epoch': 0, 'token_in_epoch': 0, diff --git a/tests/test_time.py b/tests/test_time.py index b5fad369d9..1545eaa3b1 100644 --- a/tests/test_time.py +++ b/tests/test_time.py @@ -151,7 +151,7 @@ def test_timestamp_to_next_batch_epoch_iteration(): # Step batch 0 in epoch 0 timestamp = timestamp.to_next_batch(10, 20, datetime.timedelta(seconds=5)) assert timestamp.batch == 1 - assert timestamp.batch_in_epoch == 1 + assert timestamp.token_in_iteration == 20 assert timestamp.batch_in_epoch == 1 assert timestamp.sample == 10 assert timestamp.sample_in_epoch == 10 @@ -163,9 +163,10 @@ def test_timestamp_to_next_batch_epoch_iteration(): assert timestamp.batch_wct == datetime.timedelta(seconds=5) # Finish epoch 0 - timestamp = timestamp.to_next_epoch(datetime.timedelta(seconds=5)) + timestamp = timestamp.to_next_epoch(duration=datetime.timedelta(seconds=5)) assert timestamp.epoch == 1 assert timestamp.batch == 1 + assert timestamp.token_in_iteration == 20 assert timestamp.batch_in_epoch == 0 assert timestamp.sample == 10 assert timestamp.sample_in_epoch == 0 @@ -181,6 +182,7 @@ def test_timestamp_to_next_batch_epoch_iteration(): assert timestamp.epoch == 1 assert timestamp.batch == 2 assert timestamp.epoch_in_iteration == 1 + assert timestamp.token_in_iteration == 20 assert timestamp.batch_in_epoch == 1 assert timestamp.sample == 15 assert timestamp.sample_in_epoch == 5 @@ -195,6 +197,7 @@ def test_timestamp_to_next_batch_epoch_iteration(): timestamp = timestamp.to_next_batch(5, 1, datetime.timedelta(seconds=10)) assert 
timestamp.epoch == 1 assert timestamp.batch == 3 + assert timestamp.token_in_iteration == 21 assert timestamp.batch_in_epoch == 2 assert timestamp.sample == 20 assert timestamp.sample_in_epoch == 10 @@ -210,6 +213,7 @@ def test_timestamp_to_next_batch_epoch_iteration(): assert timestamp.epoch == 2 assert timestamp.batch == 3 assert timestamp.epoch_in_iteration == 2 + assert timestamp.token_in_iteration == 21 assert timestamp.batch_in_epoch == 0 assert timestamp.sample == 20 assert timestamp.sample_in_epoch == 0 @@ -224,6 +228,7 @@ def test_timestamp_to_next_batch_epoch_iteration(): assert timestamp.epoch == 2 assert timestamp.batch == 4 assert timestamp.epoch_in_iteration == 2 + assert timestamp.token_in_iteration == 22 assert timestamp.batch_in_epoch == 1 assert timestamp.sample == 25 assert timestamp.sample_in_epoch == 5 @@ -240,6 +245,7 @@ def test_timestamp_to_next_batch_epoch_iteration(): assert timestamp.epoch == 2 assert timestamp.batch == 4 assert timestamp.epoch_in_iteration == 0 + assert timestamp.token_in_iteration == 0 assert timestamp.batch_in_epoch == 0 assert timestamp.sample == 25 assert timestamp.sample_in_epoch == 0 From 9500fd17d809d364af85911aefa92d621451399c Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Thu, 13 Jun 2024 17:56:05 -0700 Subject: [PATCH 12/69] Busy wait utils in dist (#3396) --- composer/utils/dist.py | 73 ++++++++++++++++++++++++++++++++++++++++ tests/utils/test_dist.py | 47 ++++++++++++++++++++++++++ 2 files changed, 120 insertions(+) diff --git a/composer/utils/dist.py b/composer/utils/dist.py index 573e940bb9..95a95835f4 100644 --- a/composer/utils/dist.py +++ b/composer/utils/dist.py @@ -37,6 +37,8 @@ import logging import os import pickle +import random +import string import sys import time from contextlib import contextmanager @@ -627,6 +629,77 @@ def get_sampler( ) +def get_node_signal_file_name(rng: Optional[random.Random] = None) -> str: + """Returns a file name to use for a file based wait within a node. + + The file name will contain a randomly generated string to avoid conflicts. + Note: This file name will be the same on each node, so that it can be used for a file based wait. + + Returns: + str: The name of the file that will be created to signal the end of a node's training. + """ + if rng is None: + rng = random.Random() + + random_string = ''.join(rng.choices(string.ascii_letters + string.digits, k=6)) + node_rank = get_node_rank() + file_name_list = [f'._signal_file_node{node_rank}_{random_string}'] + dist.broadcast_object_list(file_name_list, src=0) + return file_name_list[0] + + +def write_signal_file(signal_file_name: str, dir_path: Optional[str] = None) -> str: + """Writes a signal file to the specified directory. + + This function creates a signal file in the specified directory. If the directory does + Note: Only local rank zero writes the signal file. All other ranks are expected to wait for the signal file. + + Args: + signal_file_name (str): The name of the signal file. + dir_path (str, optional): The full path to the directory in which to create the signal file. If ``None``, + the current working directory will be used. 
+ """ + if dir_path is not None: + os.makedirs(dir_path, exist_ok=True) + + signal_file_path = os.path.join(dir_path or os.getcwd(), signal_file_name) + if get_local_rank() == 0: + with open(signal_file_path, 'w') as _f: + _f.write('local rank zero done') + + return signal_file_path + + +@contextmanager +def busy_wait_for_local_rank_zero(dir_path: Optional[str] = None): + """Busy waits for the signal file to be created by local rank zero. + + This function will wait for the signal file to be created by local rank zero. It will + check every 0.1 seconds for the existence of the file. + + Args: + dir_path (str, optional): The directory in which to look for the signal file. If ``None``, + the current working directory will be used. + """ + # Get unique file name + signal_file_name = get_node_signal_file_name() + + # All ranks yield execution to allow local rank zero to run the code it needs to + yield + + # Local rank zero writes the signal file, all other rank just get the expected path + signal_file_path = write_signal_file(signal_file_name=signal_file_name, dir_path=dir_path) + + # Wait for the signal file to be created by local rank zero + with local_rank_zero_download_and_wait(signal_file_path): + # Sync all ranks across nodes as busy wait only is within node + dist.barrier() + + # Remove the signal file + if get_local_rank() == 0: + os.remove(signal_file_path) + + @contextmanager def local_rank_zero_download_and_wait(expected_file_path: str): """Context manager to wait for a file to exist on all ranks except local rank zero. diff --git a/tests/utils/test_dist.py b/tests/utils/test_dist.py index 44aedecf3d..608e56e5d2 100644 --- a/tests/utils/test_dist.py +++ b/tests/utils/test_dist.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 +import os +import time from unittest.mock import patch import pytest @@ -27,3 +29,48 @@ def test_run_local_rank_first_context_runs_properly(): # so dist is initialized here and this code should run without error with dist.run_local_rank_zero_first(): pass + + +@pytest.mark.world_size(2) +def test_get_node_signal_file_name(): + file_name = dist.get_node_signal_file_name() + gathered_file_names = dist.all_gather_object(file_name) + + assert len(gathered_file_names) == 2 + assert gathered_file_names[0] == gathered_file_names[1] + assert gathered_file_names[0] == file_name + assert file_name.startswith('._signal_file_node0_') + assert len(file_name) == len('._signal_file_node0_') + 6 + + +@pytest.mark.world_size(2) +def test_write_signal_file(tmp_path): + file_name = dist.get_node_signal_file_name() + file_path = os.path.join(tmp_path, file_name) + dist.write_signal_file(file_name, tmp_path) + + # tmp_path will be different on each rank, and only rank zero + # should have written a file + if dist.get_local_rank() == 0: + assert os.path.exists(file_path) + else: + assert not os.path.exists(file_path) + + +@pytest.mark.world_size(2) +def test_busy_wait_for_local_rank_zero(tmp_path): + gathered_tmp_path = dist.all_gather_object(tmp_path)[0] + + dist.barrier() + start_time = time.time() + assert os.listdir(gathered_tmp_path) == [] + with dist.busy_wait_for_local_rank_zero(gathered_tmp_path): + if dist.get_local_rank() == 0: + time.sleep(0.5) + + end_time = time.time() + total_time = end_time - start_time + gathered_times = dist.all_gather_object(total_time) + assert os.listdir(gathered_tmp_path) == [] + assert len(gathered_times) == 2 + assert abs(gathered_times[0] - gathered_times[1]) < 0.1 From 
a1c581d86157a1b3d20886ca1e1c3433ed922adc Mon Sep 17 00:00:00 2001 From: Chen Qian Date: Fri, 14 Jun 2024 13:17:45 -0700 Subject: [PATCH 13/69] Add buffering time to mlflow logger (#3401) * Add buffering time to mlflow logger * rename * change default and fix comments --- composer/loggers/mlflow_logger.py | 7 ++++++ tests/loggers/test_mlflow_logger.py | 37 +++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/composer/loggers/mlflow_logger.py b/composer/loggers/mlflow_logger.py index 92b3fc2657..9a64ef5d9d 100644 --- a/composer/loggers/mlflow_logger.py +++ b/composer/loggers/mlflow_logger.py @@ -66,6 +66,9 @@ class MLFlowLogger(LoggerDestination): resume (bool, optional): If ``True``, Composer will search for an existing run tagged with the `run_name` and resume it. If no existing run is found, a new run will be created. If ``False``, Composer will create a new run. (default: ``False``) + logging_buffer_seconds (int, optional): The amount of time, in seconds, that MLflow + waits before sending logs to the MLflow tracking server. Metrics/params/tags logged + within this buffer time will be grouped in batches before being sent to the backend. """ def __init__( @@ -85,6 +88,7 @@ def __init__( ignore_hyperparameters: Optional[list[str]] = None, run_group: Optional[str] = None, resume: bool = False, + logging_buffer_seconds: Optional[int] = 10, ) -> None: try: import mlflow @@ -116,6 +120,9 @@ def __init__( ) self.resume = resume + if logging_buffer_seconds: + os.environ['MLFLOW_ASYNC_LOGGING_BUFFERING_SECONDS'] = str(logging_buffer_seconds) + self._rank_zero_only = rank_zero_only self._last_flush_time = time.time() self._flush_interval = flush_interval diff --git a/tests/loggers/test_mlflow_logger.py b/tests/loggers/test_mlflow_logger.py index 61d52d8023..4fe221f52d 100644 --- a/tests/loggers/test_mlflow_logger.py +++ b/tests/loggers/test_mlflow_logger.py @@ -798,6 +798,43 @@ def test_rename_metrics(self, device, num_batches, tmp_path): assert not os.path.exists(metric_file) +def test_mlflow_logging_time_buffer(tmp_path): + mlflow = pytest.importorskip('mlflow') + if not hasattr(mlflow.environment_variables, 'MLFLOW_ASYNC_LOGGING_BUFFERING_SECONDS'): + pytest.skip('MLFlow {mlflow.__version__} does not support async logging buffer seconds.') + + with patch('mlflow.store.tracking.file_store.FileStore.log_batch') as mock_log_batch: + + mlflow_uri = tmp_path / Path('my-test-mlflow-uri') + experiment_name = 'mlflow_logging_test' + mock_state = MagicMock() + mock_logger = MagicMock() + + test_mlflow_logger = MLFlowLogger( + tracking_uri=mlflow_uri, + experiment_name=experiment_name, + log_system_metrics=True, + run_name='test_run', + logging_buffer_seconds=2, + ) + test_mlflow_logger.init(state=mock_state, logger=mock_logger) + test_mlflow_logger.log_hyperparameters({'name': 'test'}) + steps = 10 + for i in range(steps): + metrics = { + 'foo': i, + 'bar': i, + } + test_mlflow_logger.log_metrics(metrics, step=i) + test_mlflow_logger.post_close() + + # There will be 2 calls to `log_batch`, one from `start_run` with tags, and one from the metrics + # and hyperparameters logging. 
+ assert mock_log_batch.call_count == 2 + assert len(mock_log_batch.call_args_list[0][1]['metrics']) == 0 + assert len(mock_log_batch.call_args_list[1][1]['metrics']) == 2 * steps + + def test_mlflow_resume_run(tmp_path): mlflow = pytest.importorskip('mlflow') From fffa33571e5d5c8bc9b3e8ecdf97f75d74ce39c8 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Fri, 14 Jun 2024 13:36:53 -0700 Subject: [PATCH 14/69] Update _patch_pytorch.py (#3402) --- composer/trainer/_patch_pytorch.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/composer/trainer/_patch_pytorch.py b/composer/trainer/_patch_pytorch.py index 5e59849d45..6771c5db4b 100644 --- a/composer/trainer/_patch_pytorch.py +++ b/composer/trainer/_patch_pytorch.py @@ -933,6 +933,8 @@ def device_mesh__getitem__(self, mesh_dim_names: Union[str, tuple[str]]) -> 'Dev return submesh else: + from torch.distributed.device_mesh import _mesh_resources + def create_child_mesh( self, parent_mesh: 'DeviceMesh', submesh_dim_names: Tuple[str, ...], ) -> 'DeviceMesh': From 3e1396eb18b33ccb387408bf436c672f50466b69 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Mon, 17 Jun 2024 08:52:48 -0700 Subject: [PATCH 15/69] Add pynvml to mlflow dep group (#3404) --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 2eba4e39e6..5beb4d5136 100644 --- a/setup.py +++ b/setup.py @@ -225,6 +225,7 @@ def package_files(prefix: str, directory: str, extension: str): extra_deps['mlflow'] = [ 'mlflow>=2.11.1,<3.0', 'databricks-sdk==0.28.0', + 'pynvml>=11.5.0,<12', ] extra_deps['pandas'] = ['pandas>=2.0.0,<3.0'] From 0eb1eee6c0dc94426cd423903e9cb0d4dc89a5ad Mon Sep 17 00:00:00 2001 From: Jack Zhang <170473087+JackZ-db@users.noreply.github.com> Date: Mon, 17 Jun 2024 09:10:43 -0700 Subject: [PATCH 16/69] min/max flagging added to system_metrics_monitor with only non-redundant, necessary gpu metrics logged (#3373) * implemented min_max flag * fixed string parsing * refactoring compute_system_metrics for all_reduce * keep track of rank within dict * added compute_min_max * added flag for both min_max and all_logging * corrected min_max call with model_device * removing total bytes (always going ot be constant) * handled no gpu case in min_max flag * removed unnecessary imports, patched unit tests * fixed assert statement for with gpu case, world size 1 * case min_rank and max_rank as int to guarantee them working as indices * fixed indent issue from fixing font * made docs more concise and readable * fixing unexpected unindent * fixing unit test device * modifying device to equal model_device.type * reverting to device=model_device * setting device in unit test = 'gpu' * setting device = 'cuda' in unit testing * reverting to next(state.model.parameters()).device * removed torch as a dependecy for unit_testing * cleaned up UI to be consistent + removed calling next to obtain device --------- Co-authored-by: Mihir Patel Co-authored-by: Charles Tang --- composer/callbacks/system_metrics_monitor.py | 93 ++++++++++++++++--- .../callbacks/test_system_metrics_monitor.py | 4 +- 2 files changed, 81 insertions(+), 16 deletions(-) diff --git a/composer/callbacks/system_metrics_monitor.py b/composer/callbacks/system_metrics_monitor.py index 292e31e57b..bdd2cebce2 100644 --- a/composer/callbacks/system_metrics_monitor.py +++ b/composer/callbacks/system_metrics_monitor.py @@ -9,6 +9,7 @@ import os import psutil +import torch from composer.core import Callback, Event, State from composer.loggers import Logger @@ 
-19,13 +20,52 @@ __all__ = ['SystemMetricsMonitor'] +_GPU_METRICS = [ + 'gpu_percentage', + 'memory_percentage', + 'gpu_temperature_C', + 'gpu_power_usage_W', +] + class SystemMetricsMonitor(Callback): - """Track system metrics.""" + """Logs GPU/CPU metrics. + + GPU Metrics: + gpu_percentage: Occupancy rate, percent of time over sampling period during which one or more kernels was executing on the GPU. + memory_percentage: Percent of time over sampling period during which global memory was being read or written. + gpu_temperature_C: Temperature of device, in Celcius. + gpu_power_usage_W: Power usage of device, in Watts. + + By default, only the maximum and minimum values for these metrics, alongside their respective ranks in the key names, + are logged on the :attr:`.Event.BATCH_START`, :attr:`.Event.EVAL_BATCH_START`, :attr:`.Event.PREDICT_BATCH_START` + events for every batch. If log_all_data is set to True, all values for these metrics across all ranks are logged on the + above events for every batch. + + Example: + .. doctest:: - def __init__(self, gpu_available: bool = False) -> None: + >>> from composer import Trainer + >>> from composer.callbacks import SystemMetricsMonitor + >>> # constructing trainer object with this callback + >>> trainer = Trainer( + ... model=model, + ... train_dataloader=train_dataloader, + ... eval_dataloader=eval_dataloader, + ... optimizers=optimizer, + ... max_duration='1ep', + ... callbacks=[SystemMetricsMonitor()], + ... ) + + Args: + log_all_data (bool, optional): True if user wants to log data for all ranks, not just the min/max. + Defaults to False. + """ + + def __init__(self, log_all_data: bool = False) -> None: super().__init__() - self.gpu_available = gpu_available + self.gpu_available = torch.cuda.is_available() + self.log_all_data = log_all_data if self.gpu_available: try: import pynvml @@ -46,9 +86,23 @@ def run_event(self, event: Event, state: State, logger: Logger): ]: local_node_system_metrics = self.compute_system_metrics() all_system_metrics = dist.all_gather_object(local_node_system_metrics) - system_metrics = { - key: value for local_metrics in all_system_metrics for key, value in local_metrics.items() - } + system_metrics = {} + + if self.log_all_data: + for rank, metrics in enumerate(all_system_metrics): + for key, value in metrics.items(): + if key in _GPU_METRICS: + system_metrics[f'{key}_rank_{rank}'] = value + else: + system_metrics[key] = value + + else: + system_metrics = self.compute_gpu_min_max_metrics(all_system_metrics, state) + for rank, metrics in enumerate(all_system_metrics): + for key, value in metrics.items(): + if key not in _GPU_METRICS: + system_metrics[key] = value + logger.log_metrics(system_metrics) def compute_system_metrics(self): @@ -58,17 +112,14 @@ def compute_system_metrics(self): if self.gpu_available: import pynvml local_rank = dist.get_local_rank() - global_rank = dist.get_global_rank() handle = pynvml.nvmlDeviceGetHandleByIndex(local_rank) - memory = pynvml.nvmlDeviceGetMemoryInfo(handle) - system_metrics[f'device{global_rank}_memory_total'] = memory.total - system_metrics[f'device{global_rank}_memory_free'] = memory.free - system_metrics[f'device{global_rank}_memory_used'] = memory.used device_utilization = pynvml.nvmlDeviceGetUtilizationRates(handle) - system_metrics[f'device{global_rank}_gpu_percentage'] = device_utilization.gpu - system_metrics[f'device{global_rank}_memory_percentage'] = device_utilization.memory + system_metrics['gpu_percentage'] = device_utilization.gpu + 
system_metrics['memory_percentage'] = device_utilization.memory temperature = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU) - system_metrics[f'device{global_rank}_gpu_temperature'] = temperature + system_metrics['gpu_temperature_C'] = temperature + power = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0 # convert from mW to W + system_metrics['gpu_power_usage_W'] = power # Get metrics for the system cpu_percent = psutil.cpu_percent() @@ -83,3 +134,17 @@ def compute_system_metrics(self): for k, v in network_usage.items(): system_metrics[f'network_{k}'] = v return system_metrics + + def compute_gpu_min_max_metrics(self, all_metrics, state): + min_max_metrics = {} + + if self.gpu_available: + for key in _GPU_METRICS: + values = torch.tensor([metrics_for_cur_rank[key] for metrics_for_cur_rank in all_metrics]) + values = state.device.tensor_to_device(values) + min_rank = int(torch.argmin(values).item()) + max_rank = int(torch.argmax(values).item()) + min_max_metrics[f'min_{key}_rank_{min_rank}'] = values[min_rank].item() + min_max_metrics[f'max_{key}_rank_{max_rank}'] = values[max_rank].item() + + return min_max_metrics diff --git a/tests/callbacks/test_system_metrics_monitor.py b/tests/callbacks/test_system_metrics_monitor.py index a26d02ba93..c974f6cbed 100644 --- a/tests/callbacks/test_system_metrics_monitor.py +++ b/tests/callbacks/test_system_metrics_monitor.py @@ -13,7 +13,7 @@ @pytest.mark.gpu def test_system_metrics_monitor_gpu(): # Construct the trainer - system_metrics_monitor = SystemMetricsMonitor(gpu_available=True) + system_metrics_monitor = SystemMetricsMonitor() in_memory_logger = InMemoryLogger() trainer = Trainer( model=SimpleModel(), @@ -24,7 +24,7 @@ def test_system_metrics_monitor_gpu(): ) trainer.fit() - assert 'device0_gpu_percentage' in in_memory_logger.data + assert 'min_gpu_percentage_rank_0' in in_memory_logger.data assert 'cpu_percentage' in in_memory_logger.data From 0ee83f78db4d645819fed51b91a69aee6c8fc2df Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Mon, 17 Jun 2024 10:57:49 -0700 Subject: [PATCH 17/69] simplify launcher (#3398) --- composer/cli/launcher.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/composer/cli/launcher.py b/composer/cli/launcher.py index 08dd7b3921..91110c2add 100755 --- a/composer/cli/launcher.py +++ b/composer/cli/launcher.py @@ -197,8 +197,13 @@ def _parse_args(): if args.nproc < 1: raise ValueError('The nproc must be 1 or greater') - if args.world_size is None and 'WORLD_SIZE' in os.environ: - args.world_size = int(os.environ['WORLD_SIZE']) + if args.world_size is None: + if 'WORLD_SIZE' in os.environ and os.environ.get('LOCAL_WORLD_SIZE') != os.environ['WORLD_SIZE']: + # Use WORLD_SIZE env var if set and running multinode. Otherwise, default to nproc + # to enable easy overriding of number of processes when on a single node. 
+ args.world_size = int(os.environ['WORLD_SIZE']) + else: + args.world_size = args.nproc if args.base_rank is None and 'BASE_RANK' in os.environ: args.base_rank = int(os.environ['BASE_RANK']) @@ -212,9 +217,6 @@ def _parse_args(): if args.master_port is None and 'MASTER_PORT' in os.environ: args.master_port = int(os.environ['MASTER_PORT']) - if args.world_size is None: - args.world_size = args.nproc - if args.world_size < args.nproc: raise ValueError(f'world_size({args.world_size}) cannot be less than nproc({args.nproc})') From 04ba0b67843247ae001ec5a2a2b495251bece057 Mon Sep 17 00:00:00 2001 From: Saaketh Narayan Date: Mon, 17 Jun 2024 12:09:55 -0700 Subject: [PATCH 18/69] Optionally use `flash-attn`'s CE loss for metrics (#3394) * yo * slam * cuda * cuda checks * test * fix_test * gloo * gloo * lint * lint --------- Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> Co-authored-by: Mihir Patel --- .github/workflows/pr-cpu.yaml | 2 +- composer/devices/device_gpu.py | 3 + composer/metrics/nlp.py | 22 ++++++- tests/checkpoint/test_state_dict.py | 6 +- tests/metrics/test_nlp_metrics.py | 89 +++++++++++++++++++++++++++++ 5 files changed, 118 insertions(+), 4 deletions(-) diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index 1bdb383823..12f471749e 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -22,7 +22,7 @@ jobs: markers: not daily and not remote and not gpu and not doctest pytest_command: coverage run -m pytest - name: cpu-3.11-2.3 - container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04 markers: not daily and not remote and not gpu and not doctest pytest_command: coverage run -m pytest - name: cpu-doctest diff --git a/composer/devices/device_gpu.py b/composer/devices/device_gpu.py index 19cb0a774a..401368576e 100644 --- a/composer/devices/device_gpu.py +++ b/composer/devices/device_gpu.py @@ -12,6 +12,7 @@ import torch.backends.cudnn import torch.cuda import torch.cuda.amp +import torch.distributed as torch_dist import torch.utils.data from composer.devices.device import Device @@ -42,6 +43,8 @@ def __init__( ): if not torch.cuda.is_available(): raise ValueError('DeviceGPU cannot be created as torch.cuda is not available.') + if torch_dist.is_gloo_available(): + DeviceGPU.dist_backend = 'cuda:nccl,cpu:gloo' if device_id is None: device_id = dist.get_local_rank() self._device = torch.device(f'cuda:{device_id}') diff --git a/composer/metrics/nlp.py b/composer/metrics/nlp.py index e6877292cf..c1562e5936 100644 --- a/composer/metrics/nlp.py +++ b/composer/metrics/nlp.py @@ -83,7 +83,21 @@ def __init__(self, dist_sync_on_step: bool = False, ignore_index: int = -100): super().__init__(dist_sync_on_step=dist_sync_on_step) self.ignore_index = ignore_index - self.loss_fn = torch.nn.CrossEntropyLoss(ignore_index=ignore_index, reduction='sum') + self.flash_loss_fn = None + try: + from flash_attn.losses.cross_entropy import CrossEntropyLoss as FusedCrossEntropyLoss + log.debug( + 'Found `flash_attn` installation. Using CrossEntropyLoss from `flash_attn`' + + 'to compute LanguageCrossEntropy metric for CUDA tensors, which will be faster.', + ) + self.flash_loss_fn = FusedCrossEntropyLoss(ignore_index=ignore_index, reduction='sum') + except ImportError: + if torch.cuda.is_available(): + log.debug( + 'Package `flash_attn` not installed. 
Using torch.nn.CrossEntropyLoss ' + + 'to compute LanguageCrossEntropy metric for CUDA tensors, which will be slower.', + ) + self.torch_loss_fn = torch.nn.CrossEntropyLoss(ignore_index=ignore_index, reduction='sum') self.add_state('sum_loss', default=torch.tensor(0.), dist_reduce_fx='sum') self.add_state('total_items', default=torch.tensor(0), dist_reduce_fx='sum') @@ -104,7 +118,11 @@ def update(self, output: Union[Mapping, Tensor], target: Tensor) -> None: target = target.view(-1) logits = logits.view(target.shape[0], -1) - losses = self.loss_fn(logits, target) + # Use Flash attn's CE loss function, if available, if inputs are both CUDA tensors. + if self.flash_loss_fn is not None and target.is_cuda and logits.is_cuda: + losses = self.flash_loss_fn(logits, target) + else: + losses = self.torch_loss_fn(logits, target) total_items = (target != self.ignore_index).sum() self.total_items += total_items #type: ignore (third-party) diff --git a/tests/checkpoint/test_state_dict.py b/tests/checkpoint/test_state_dict.py index af0ca34961..bd14154dc9 100644 --- a/tests/checkpoint/test_state_dict.py +++ b/tests/checkpoint/test_state_dict.py @@ -7,6 +7,7 @@ import pytest import torch +import torch.distributed as torch_dist from packaging import version from torch.distributed.fsdp import FullyShardedDataParallel as FSDP from torch.optim import adam @@ -530,7 +531,10 @@ def test_get_metadata_sharded_model(model_type: str, tensor_type: str, world_siz assert 'model_name' in metadata_sd assert 'dist_backend' in metadata_sd - assert metadata_sd['dist_backend'] == 'nccl' + if torch_dist.is_gloo_available(): + assert metadata_sd['dist_backend'] == 'cuda:nccl,cpu:gloo' + else: + assert metadata_sd['dist_backend'] == 'nccl' @pytest.mark.filterwarnings('ignore:SWA has') diff --git a/tests/metrics/test_nlp_metrics.py b/tests/metrics/test_nlp_metrics.py index 7fe854bd96..9b198003d3 100644 --- a/tests/metrics/test_nlp_metrics.py +++ b/tests/metrics/test_nlp_metrics.py @@ -14,6 +14,7 @@ LanguagePerplexity, MaskedAccuracy, ) +from tests.common import device @pytest.mark.parametrize('ignore_index', [-100]) @@ -50,12 +51,100 @@ def test_masked_accuracy(ignore_index, num_classes): assert abs(final_acc - (1.0 / num_classes)) < 0.02 +@device('cpu', 'gpu') @pytest.mark.parametrize('ignore_index', [-100]) @pytest.mark.parametrize('batch_size', [1e2, 1e3]) @pytest.mark.parametrize('sequence_length', [128]) @pytest.mark.parametrize('num_classes', [2, 10]) @pytest.mark.parametrize('minibatch_size', [56, 256, 768]) +@pytest.mark.parametrize('tensor_device', ['cpu', 'gpu']) def test_cross_entropy( + device: str, + batch_size: float, + ignore_index: Optional[int], + sequence_length: int, + num_classes: int, + minibatch_size: int, + tensor_device: str, +): + """Sanity check to make sure that batched CrossEntropyLoss matches the expected performance. + + Generates a predicted distribution from a normal distribution, and a ground truth from a normal distribution. + Verifies Cross Entropy Loss against the baseline performance. + + Args: + device (str): the device to run the test on + batch_size (int): how many samples are in each batch + ignore_index (Optional[int]): if present, the class index to ignore in accuracy calculations. 
+ sequence_length (int): the length of the generated sequence + num_classes (int): the number of classes in the classification task + minibatch_size (int): the minibatch size to simulate for model predictions + tensor_device (str): which device the input tensors to the metric are on + """ + + if device == 'cpu' and tensor_device == 'gpu': + pytest.skip('Skipping test that would try to use GPU tensors when only CPU is available.') + + batch_size = int(batch_size) + generated_preds = torch.randn((batch_size, sequence_length, num_classes)) + generated_true = torch.randint(low=0, high=num_classes, size=(batch_size, sequence_length)) + + assert ignore_index is not None + torchmetrics_xent = LanguageCrossEntropy(dist_sync_on_step=False, ignore_index=ignore_index) + ce_with_keys_metric = LanguageCrossEntropy(dist_sync_on_step=False, ignore_index=ignore_index) + + if tensor_device == 'cpu': + torchmetrics_xent = torchmetrics_xent.to('cpu') + ce_with_keys_metric = ce_with_keys_metric.to('cpu') + elif tensor_device == 'gpu': + torchmetrics_xent = torchmetrics_xent.to('cuda') + ce_with_keys_metric = ce_with_keys_metric.to('cuda') + + if device == 'gpu': + assert torchmetrics_xent.flash_loss_fn is not None + + labels_mask = torch.rand((batch_size, sequence_length)) + labels_mask[labels_mask > 0.8] = 1 + labels_mask[labels_mask <= 0.8] = 0 + labels_mask = labels_mask.bool() + generated_true[labels_mask] = ignore_index + + num_batches = math.ceil(batch_size / minibatch_size) + for batch_idx in range(num_batches): + begin_idx = (batch_idx * minibatch_size) + end_idx = ((batch_idx + 1) * minibatch_size) + preds_subset = generated_preds[begin_idx:end_idx] + true_subset = generated_true[begin_idx:end_idx] + + if tensor_device == 'cpu': + preds_subset = preds_subset.cpu() + true_subset = true_subset.cpu() + elif tensor_device == 'gpu': + preds_subset = preds_subset.cuda() + true_subset = true_subset.cuda() + + torchmetrics_xent.update(preds_subset, true_subset) + ce_with_keys_metric.update( + { + 'logits': preds_subset.view(-1, num_classes), + 'loss': cross_entropy(preds_subset.view(-1, num_classes), true_subset.view(-1)), + }, + true_subset.view(-1), + ) + + torchmetrics_loss = torchmetrics_xent.compute() + ce_with_keys_loss = ce_with_keys_metric.compute() + correct_loss = cross_entropy(generated_preds.view(-1, num_classes), generated_true.view(-1)) + assert torchmetrics_loss == ce_with_keys_loss + assert torch.isclose(correct_loss, torchmetrics_loss) + + +@pytest.mark.parametrize('ignore_index', [-100]) +@pytest.mark.parametrize('batch_size', [1e2, 1e3]) +@pytest.mark.parametrize('sequence_length', [128]) +@pytest.mark.parametrize('num_classes', [2, 10]) +@pytest.mark.parametrize('minibatch_size', [56, 256, 768]) +def test_torch_cpu_cross_entropy( batch_size: float, ignore_index: Optional[int], sequence_length: int, From 1dfd3bc999ad839f1bd83ecfecf01832e8965ccb Mon Sep 17 00:00:00 2001 From: Jesse Chan Date: Mon, 17 Jun 2024 12:30:39 -0700 Subject: [PATCH 19/69] log image fix (#3286) * log image fix Signed-off-by: Jesse Chan * fixed log image tests Signed-off-by: Jesse Chan * linter Signed-off-by: Jesse Chan * add simd requirement * post0? * versioning yada yada yada * guh * import fix? * update deps * fix * fix II * remove other dependency * debug statement, remove * post1?! * build from source * whitespace? 
* use pillow * delete a unit test and ignore some types * s/type/pyright * formatting * formatting * ignore more stuff * Apply suggestions from code review * remove rest * Update setup.py Co-authored-by: Mihir Patel * try no ignore * remove intenum --------- Signed-off-by: Jesse Chan Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> Co-authored-by: Milo Cress Co-authored-by: Mihir Patel --- .../utils/augmentation_primitives.py | 31 ++++++++++++++++--- composer/loggers/mlflow_logger.py | 3 +- setup.py | 2 +- tests/loggers/test_mlflow_logger.py | 5 +-- tests/test_docker.py | 18 ----------- 5 files changed, 32 insertions(+), 27 deletions(-) delete mode 100644 tests/test_docker.py diff --git a/composer/algorithms/utils/augmentation_primitives.py b/composer/algorithms/utils/augmentation_primitives.py index d2b2417e5c..d3ac807c02 100644 --- a/composer/algorithms/utils/augmentation_primitives.py +++ b/composer/algorithms/utils/augmentation_primitives.py @@ -30,6 +30,7 @@ import numpy as np from PIL import Image, ImageEnhance, ImageOps +from PIL.Image import Resampling, Transform AugmentationFn = Callable[[Image.Image, float], Image.Image] @@ -155,7 +156,7 @@ def rotate(pil_img: Image.Image, level: float): degrees = _int_parameter(_sample_level(level), 30) if np.random.uniform() > 0.5: degrees = -degrees - return pil_img.rotate(degrees, resample=Image.BILINEAR) + return pil_img.rotate(degrees, resample=Resampling.BILINEAR) def solarize(pil_img: Image.Image, level: float): @@ -183,7 +184,12 @@ def shear_x(pil_img: Image.Image, level: float): level = _float_parameter(_sample_level(level), 0.3) if np.random.uniform() > 0.5: level = -level - return pil_img.transform(pil_img.size, Image.AFFINE, (1, level, 0, 0, 1, 0), resample=Image.BILINEAR) + return pil_img.transform( + pil_img.size, + Transform.AFFINE, + (1, level, 0, 0, 1, 0), + resample=Resampling.BILINEAR, + ) def shear_y(pil_img: Image.Image, level: float): @@ -197,7 +203,12 @@ def shear_y(pil_img: Image.Image, level: float): level = _float_parameter(_sample_level(level), 0.3) if np.random.uniform() > 0.5: level = -level - return pil_img.transform(pil_img.size, Image.AFFINE, (1, 0, 0, level, 1, 0), resample=Image.BILINEAR) + return pil_img.transform( + pil_img.size, + Transform.AFFINE, + (1, 0, 0, level, 1, 0), + resample=Resampling.BILINEAR, + ) def translate_x(pil_img: Image.Image, level: float): @@ -211,7 +222,12 @@ def translate_x(pil_img: Image.Image, level: float): level = _int_parameter(_sample_level(level), pil_img.size[0] / 3) if np.random.random() > 0.5: level = -level - return pil_img.transform(pil_img.size, Image.AFFINE, (1, 0, level, 0, 1, 0), resample=Image.BILINEAR) + return pil_img.transform( + pil_img.size, + Transform.AFFINE, + (1, 0, level, 0, 1, 0), + resample=Resampling.BILINEAR, + ) def translate_y(pil_img: Image.Image, level: float): @@ -225,7 +241,12 @@ def translate_y(pil_img: Image.Image, level: float): level = _int_parameter(_sample_level(level), pil_img.size[1] / 3) if np.random.random() > 0.5: level = -level - return pil_img.transform(pil_img.size, Image.AFFINE, (1, 0, 0, 0, 1, level), resample=Image.BILINEAR) + return pil_img.transform( + pil_img.size, + Transform.AFFINE, + (1, 0, 0, 0, 1, level), + resample=Resampling.BILINEAR, + ) # The following augmentations overlap with corruptions in the ImageNet-C/CIFAR10-C test diff --git a/composer/loggers/mlflow_logger.py b/composer/loggers/mlflow_logger.py index 9a64ef5d9d..03070c28f9 100644 --- a/composer/loggers/mlflow_logger.py +++ 
b/composer/loggers/mlflow_logger.py @@ -507,8 +507,9 @@ def log_images( assert isinstance(self._run_id, str) self._mlflow_client.log_image( image=image, - artifact_file=f'{name}_{step}_{im_ind}.png', + key=f'{name}_{step}_{im_ind}', run_id=self._run_id, + step=step, ) def post_close(self): diff --git a/setup.py b/setup.py index 5beb4d5136..cbffa0b79c 100644 --- a/setup.py +++ b/setup.py @@ -142,7 +142,7 @@ def package_files(prefix: str, directory: str, extension: str): 'cryptography==42.0.8', 'pytest-httpserver>=1.0.4,<1.1', 'setuptools<=59.5.0', - 'pillow==9.3.0', # Matches the Pillow version listed in the Dockerfile + 'pillow>=10.3.0,<11', ] extra_deps['system_metrics_monitor'] = { diff --git a/tests/loggers/test_mlflow_logger.py b/tests/loggers/test_mlflow_logger.py index 4fe221f52d..5ee6aab7a5 100644 --- a/tests/loggers/test_mlflow_logger.py +++ b/tests/loggers/test_mlflow_logger.py @@ -650,8 +650,9 @@ def before_forward(self, state: State, logger: Logger): experiment_id = run.info.experiment_id run_file_path = mlflow_uri / Path(experiment_id) / Path(run_id) - im_dir = run_file_path / Path('artifacts') - assert len(os.listdir(im_dir)) == expected_num_ims + im_dir = run_file_path / Path('artifacts') / Path('images') + # 2 (compressed & uncompressed) per image, and two log images calls in ImageLogger + assert len(os.listdir(im_dir)) == expected_num_ims * 2 * 2 @device('cpu') diff --git a/tests/test_docker.py b/tests/test_docker.py deleted file mode 100644 index 8a269d7563..0000000000 --- a/tests/test_docker.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -import os -import platform - -import PIL -import pytest - - -@pytest.mark.skipif( - 'composer-python' not in os.environ['PATH'] or 'Linux' not in platform.system(), - reason='Pillow-simd test only checks if using the composer docker', -) -class TestDocker: - - def test_pillow_simd(self): - assert 'post' in PIL.__version__, 'pillow-simd is not installed' From f7e17de45a439e4cc84b87fab124a5c5d2ac93e4 Mon Sep 17 00:00:00 2001 From: Evan Racah Date: Mon, 17 Jun 2024 15:58:32 -0700 Subject: [PATCH 20/69] [ckpt-rewr] Save state dict API (#3372) --- composer/checkpoint/save.py | 145 ++++++++++++++++++++++++++++ docs/source/conf.py | 1 - tests/checkpoint/helpers.py | 110 +++++++++++++++++++++ tests/checkpoint/test_save.py | 79 +++++++++++++++ tests/checkpoint/test_state_dict.py | 107 ++------------------ tests/common/compare.py | 36 ++++++- 6 files changed, 377 insertions(+), 101 deletions(-) create mode 100644 composer/checkpoint/save.py create mode 100644 tests/checkpoint/helpers.py create mode 100644 tests/checkpoint/test_save.py diff --git a/composer/checkpoint/save.py b/composer/checkpoint/save.py new file mode 100644 index 0000000000..72e5311d0f --- /dev/null +++ b/composer/checkpoint/save.py @@ -0,0 +1,145 @@ +# Copyright 2024 MosaicML Composer authors +# SPDX-License-Identifier: Apache-2.0 + +"""Useful functions for saving state dicts to disk.""" + +import logging +import os +import textwrap +import warnings +from pathlib import Path +from typing import Any, Dict, Optional, Union + +import torch +import torch.distributed.checkpoint as DCP +from packaging import version +from torch.distributed._shard.sharded_tensor import ShardedTensor +from torch.distributed._tensor import DTensor + +from composer.utils import dist +from composer.utils.checkpoint import _TORCH_DISTRIBUTED_CHECKPOINTS_FILENAME, _write_checkpoint_file + +log = logging.getLogger(__name__) + + +def 
save_state_dict_to_disk(
+    state_dict: Dict[str, Any],
+    destination_file_path: str,
+    overwrite: bool = False,
+    save_format: str = 'pt', # or hf, safetensor
+) -> Optional[str]:
+    """Saves a state dict to local disk.
+
+    Args:
+        state_dict (Dict[str,Any]): The state dict to save.
+        destination_file_path (str): The path to save the state dict to. If sharded,
+            this should be the path to a directory. Otherwise, it should be a path to a file.
+        overwrite (bool): If True, the file will be overwritten if it exists.
+        save_format (str): The format to save the state dict in. One of 'pt', 'hf', or 'safetensor'.
+
+    Returns:
+        str: The full path to the saved state dict if sharded is true, or if sharded is false and this is rank 0; otherwise None.
+    """
+    if state_dict == {}:
+        return None
+    if is_state_dict_sharded(state_dict):
+        path_saved = _save_sharded_state_dict_to_disk(state_dict, destination_file_path, overwrite, save_format)
+    else:
+        if dist.get_global_rank() == 0:
+            path_saved = _save_full_state_dict_to_disk(state_dict, destination_file_path, overwrite, save_format)
+        else:
+            path_saved = None
+
+    return path_saved
+
+
+def _save_sharded_state_dict_to_disk(
+    state_dict: Dict[str, Any],
+    destination_file_path: str,
+    overwrite: bool = False,
+    save_format: str = 'pt',
+) -> Optional[str]:
+
+    if save_format != 'pt':
+        raise NotImplementedError(
+            f"Saving sharded state dict to disk in format {save_format} is not supported. Please choose from ['pt'].",
+        )
+
+    if state_dict == {}:
+        return None
+
+    # If the user specifies a filename instead of a directory, strip the suffixes and warn
+    if len(Path(destination_file_path).suffixes) > 0:
+        stripped_path = _strip_suffixes(destination_file_path)
+        warnings.warn(
+            textwrap.dedent(
+                f"""Sharded checkpoints require a directory path not a file path:
+                {destination_file_path} will have its extensions stripped and checkpoints will be saved in {stripped_path}
+                as {stripped_path}/{_TORCH_DISTRIBUTED_CHECKPOINTS_FILENAME}""",
+            ),
+        )
+        destination_file_path = stripped_path
+
+    if dist.get_global_rank() == 0 and not overwrite and os.path.exists(destination_file_path):
+        raise ValueError(f'Directory {destination_file_path} already exists. Set overwrite=True to overwrite it.')
+
+    log.debug(
+        f'Starting saving of sharded state dict to {destination_file_path}/{_TORCH_DISTRIBUTED_CHECKPOINTS_FILENAME}',
+    )
+
+    # For 2.3.0 and above you can use checkpoint_id, but this version works best for all versions
+    # of torch that we support (and makes pyright happier), so we use it for now.
+    if version.parse(torch.__version__) < version.parse('2.2.0'):
+        DCP.save_state_dict(state_dict=state_dict, storage_writer=DCP.FileSystemWriter(destination_file_path))
+    else:
+        DCP.save(state_dict=state_dict, storage_writer=DCP.FileSystemWriter(destination_file_path))
+
+    return destination_file_path + '/' + _TORCH_DISTRIBUTED_CHECKPOINTS_FILENAME
+
+
+def _save_full_state_dict_to_disk(
+    state_dict: Dict[str, Any],
+    destination_file_path: str,
+    overwrite: bool = False,
+    save_format: str = 'pt', # or hf, safetensor
+) -> Optional[str]:
+
+    if save_format != 'pt':
+        raise NotImplementedError(
+            f"Saving a full state dict to disk in format {save_format} is not supported. Please choose from ['pt'].",
+        )
+
+    if not overwrite and os.path.exists(destination_file_path):
+        raise ValueError(f'File {destination_file_path} already exists. 
Set overwrite=True to overwrite it.') + + if dist.get_global_rank() == 0: + _write_checkpoint_file(state_dict=state_dict, filename=destination_file_path) + return destination_file_path + return None + + +def is_state_dict_sharded(state_dict: Dict[str, Any]) -> bool: + """Determines if the state dict is sharded. + + Args: + state_dict (Dict[str, Any]): The state dict to check. + + Returns: + bool: Whether the state dict is sharded. + """ + for value in state_dict.values(): + if isinstance(value, ShardedTensor) or isinstance(value, DTensor): + return True + if isinstance(value, Dict): + is_sharded = is_state_dict_sharded(value) + if is_sharded: + return True + return False + + +def _strip_suffixes(path: Union[str, Path]) -> str: + path = Path(path) + for _ in path.suffixes: + path = path.with_suffix('') + + return str(path) diff --git a/docs/source/conf.py b/docs/source/conf.py index 45affa4a0e..533ce95b78 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -219,7 +219,6 @@ def _get_commit_sha() -> str: 'torch': ('https://pytorch.org/docs/stable/', None), 'torchvision': ('https://pytorch.org/vision/stable/', None), 'torchtext': ('https://pytorch.org/text/stable/', None), - 'torchmetrics': ('https://torchmetrics.readthedocs.io/en/latest/', None), 'libcloud': ('https://libcloud.readthedocs.io/en/stable/', None), 'PIL': ('https://pillow.readthedocs.io/en/stable', None), 'coolname': ('https://coolname.readthedocs.io/en/latest/', None), diff --git a/tests/checkpoint/helpers.py b/tests/checkpoint/helpers.py new file mode 100644 index 0000000000..047d30e813 --- /dev/null +++ b/tests/checkpoint/helpers.py @@ -0,0 +1,110 @@ +# Copyright 2024 MosaicML Composer authors +# SPDX-License-Identifier: Apache-2.0 + +from typing import Any, Dict + +import torch +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.distributed.fsdp.api import CPUOffload +from torch.optim import adam + +from tests.common.models import EvenSimplerMLP, SimpleComposerMLP + +__all__ = [ + 'init_model_and_optimizer', + 'init_model', + 'init_optimizer', +] + + +def init_model_and_optimizer( + use_composer_model: bool, + num_classes=3, + batch_size=5, + num_features=8, + take_step=True, + use_fsdp=False, + tensor_type='sharded_tensor', + device='cuda', +): + model, loss_fn = init_model( + use_composer_model, + num_classes=num_classes, + num_features=num_features, + use_fsdp=use_fsdp, + tensor_type=tensor_type, + device=device, + ) + + optimizer = init_optimizer( + model, + loss_fn, + use_composer_model=use_composer_model, + num_classes=num_classes, + batch_size=batch_size, + num_features=num_features, + take_step=take_step, + device=device, + ) + + return model, optimizer + + +def init_model( + use_composer_model: bool = False, + num_classes=3, + num_features=8, + use_fsdp=False, + device='cuda', + tensor_type='sharded_tensor', + sync_module_states=True, + cpu_offload=False, +): + if use_composer_model: + model = SimpleComposerMLP(num_features=num_features, num_classes=num_classes, device=device) + loss_fn = model._loss_fn + else: + model = EvenSimplerMLP(num_features=num_features, num_out_features=num_classes, device=device) + loss_fn = torch.nn.CrossEntropyLoss() + + if use_fsdp: + fsdp_kwargs: Dict[str, Any] = dict( + use_orig_params=True, + sync_module_states=sync_module_states, # To enable easy comparison between rank 0 unsharded model and full state dict + cpu_offload=CPUOffload(offload_params=True) if cpu_offload else None, + device_id=torch.device('cpu') if device == 'cpu' else None, + ) + 
+ if tensor_type == 'dtensor': + from torch.distributed.device_mesh import init_device_mesh + device_mesh = init_device_mesh('cuda', (2,)) + fsdp_kwargs['device_mesh'] = device_mesh + + model = FSDP( + model, + **fsdp_kwargs, + ) + + return model, loss_fn + + +def init_optimizer( + model, + loss_fn, + use_composer_model: bool = False, + num_classes=3, + batch_size=5, + num_features=8, + take_step=True, + device='cuda', +): + inputs = torch.randn(batch_size, num_features, device=device) + targets = torch.randint(low=0, high=num_classes, size=(batch_size,), device=device, dtype=torch.long) + batch = (inputs, targets) if use_composer_model else inputs + optimizer = adam.Adam(model.parameters()) + outputs = model(batch) + loss = loss_fn(outputs, targets) + loss.backward() + if take_step: + optimizer.step() + return optimizer diff --git a/tests/checkpoint/test_save.py b/tests/checkpoint/test_save.py new file mode 100644 index 0000000000..03b12bbcbc --- /dev/null +++ b/tests/checkpoint/test_save.py @@ -0,0 +1,79 @@ +# Copyright 2024 MosaicML Composer authors +# SPDX-License-Identifier: Apache-2.0 + +import os +import time +import uuid +from copy import deepcopy +from pathlib import Path + +import pytest +import torch +import torch.distributed.checkpoint as DCP +from packaging import version + +from composer.checkpoint.save import save_state_dict_to_disk +from composer.checkpoint.state_dict import get_model_state_dict +from composer.utils import dist +from composer.utils.checkpoint import _TORCH_DISTRIBUTED_CHECKPOINTS_FILENAME +from tests.checkpoint.helpers import init_model +from tests.common.compare import deep_compare +from tests.common.markers import world_size + + +@world_size(1, 2) +@pytest.mark.gpu +@pytest.mark.parametrize('sharded_model', [False, True]) +def test_save_full_state_dict_to_disk(world_size: int, tmp_path: str, sharded_model: bool): + if world_size == 1 and sharded_model: + pytest.skip("Can't have a sharded model for world_size = 1") + destination_file_path = os.path.join(tmp_path, 'test.pt') + use_fsdp = sharded_model + model, _ = init_model(use_fsdp=use_fsdp, device='cuda', sync_module_states=True) + + state_dict = get_model_state_dict(model, sharded_state_dict=False) + path_saved = save_state_dict_to_disk(state_dict, destination_file_path=destination_file_path) + time.sleep(1) + if dist.get_global_rank() == 0: + assert path_saved is not None + assert path_saved == destination_file_path + assert os.path.exists(destination_file_path), f'{destination_file_path} does not exist' + loaded_state_dict = torch.load(path_saved, map_location='cuda') + deep_compare(state_dict, loaded_state_dict) + else: + assert path_saved is None + + +@world_size(2) +@pytest.mark.gpu +@pytest.mark.parametrize( + 'tensor_type', + [ + 'sharded_tensor', + pytest.param( + 'dtensor', + marks=pytest.mark.skipif( + version.parse(torch.__version__) < version.parse('2.2.0'), + reason='Requires torch>=2.2.0 for dtensor', + ), + ), + ], +) +def test_save_sharded_state_dict_to_disk(world_size: int, tmp_path: str, tensor_type: str): + + destination_file_path = os.path.join(tmp_path, str(uuid.uuid4())[:8]) + # Sync the path across all ranks + destination_file_path = dist.all_gather_object(destination_file_path)[0] + model, _ = init_model(use_fsdp=True, device='cuda', tensor_type=tensor_type) + + state_dict = get_model_state_dict(model, sharded_state_dict=True) + loaded_in_state_dict = deepcopy(state_dict) + path_saved = save_state_dict_to_disk(state_dict, destination_file_path=destination_file_path, 
overwrite=True) + assert path_saved == f'{destination_file_path}/{_TORCH_DISTRIBUTED_CHECKPOINTS_FILENAME}' + assert path_saved is not None + load_path = str(Path(path_saved).parent) + if version.parse(torch.__version__) < version.parse('2.2.0'): + DCP.load_state_dict(state_dict=loaded_in_state_dict, storage_reader=DCP.FileSystemReader(load_path)) + else: + DCP.load(state_dict=loaded_in_state_dict, storage_reader=DCP.FileSystemReader(load_path)) + deep_compare(state_dict, loaded_in_state_dict) diff --git a/tests/checkpoint/test_state_dict.py b/tests/checkpoint/test_state_dict.py index bd14154dc9..e010440836 100644 --- a/tests/checkpoint/test_state_dict.py +++ b/tests/checkpoint/test_state_dict.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import datetime -from typing import Any, Dict +from typing import Any from unittest.mock import MagicMock import pytest @@ -10,7 +10,6 @@ import torch.distributed as torch_dist from packaging import version from torch.distributed.fsdp import FullyShardedDataParallel as FSDP -from torch.optim import adam from torch.optim.lr_scheduler import StepLR from torch.utils.data import DataLoader @@ -25,6 +24,7 @@ from composer.core import State from composer.devices import DeviceCPU, DeviceGPU from composer.utils import dist, reproducibility +from tests.checkpoint.helpers import init_model_and_optimizer from tests.common.compare import deep_compare from tests.common.markers import world_size from tests.common.models import EvenSimplerMLP, SimpleComposerMLP, configure_tiny_gpt2_hf_model @@ -247,101 +247,10 @@ def test_get_model_state_dict_precision_unsharded_model(precision: str, use_comp assert tens.dtype == precision -def _init_model_and_optimizer( - use_composer_model: bool, - num_classes=3, - batch_size=5, - num_features=8, - take_step=True, - use_fsdp=False, - tensor_type='sharded_tensor', - device='cuda', -): - model, loss_fn = _init_model( - use_composer_model, - num_classes=num_classes, - batch_size=batch_size, - num_features=num_features, - use_fsdp=use_fsdp, - tensor_type=tensor_type, - device=device, - ) - - optimizer = _init_optimizer( - model, - loss_fn, - use_composer_model=use_composer_model, - num_classes=num_classes, - batch_size=batch_size, - num_features=num_features, - take_step=take_step, - device=device, - ) - - return model, optimizer - - -def _init_model( - use_composer_model: bool = False, - num_classes=3, - batch_size=5, - num_features=8, - use_fsdp=False, - device='cuda', - tensor_type='sharded_tensor', -): - if use_composer_model: - model = SimpleComposerMLP(num_features=num_features, num_classes=num_classes, device=device) - loss_fn = model._loss_fn - else: - model = EvenSimplerMLP(num_features=num_features, num_out_features=num_classes, device=device) - loss_fn = torch.nn.CrossEntropyLoss() - - if use_fsdp: - fsdp_kwargs: Dict[str, Any] = dict( - use_orig_params=True, - sync_module_states=True, # To enable easy comparison between rank 0 unsharded model and full state dict - ) - - if tensor_type == 'dtensor': - from torch.distributed.device_mesh import init_device_mesh - device_mesh = init_device_mesh('cuda', (2,)) - fsdp_kwargs['device_mesh'] = device_mesh - - model = FSDP( - model, - **fsdp_kwargs, - ) - - return model, loss_fn - - -def _init_optimizer( - model, - loss_fn, - use_composer_model: bool = False, - num_classes=3, - batch_size=5, - num_features=8, - take_step=True, - device='cuda', -): - inputs = torch.randn(batch_size, num_features, device=device) - targets = torch.randint(low=0, high=num_classes, 
size=(batch_size,), device=device, dtype=torch.long) - batch = (inputs, targets) if use_composer_model else inputs - optimizer = adam.Adam(model.parameters()) - outputs = model(batch) - loss = loss_fn(outputs, targets) - loss.backward() - if take_step: - optimizer.step() - return optimizer - - @pytest.mark.gpu @pytest.mark.parametrize('use_composer_model', [True, False]) def test_get_optim_state_dict_unsharded_model(use_composer_model: bool): - model, optimizer = _init_model_and_optimizer(use_composer_model=use_composer_model, take_step=True) + model, optimizer = init_model_and_optimizer(use_composer_model=use_composer_model, take_step=True) optim_state_dict = get_optim_state_dict(model, optimizer) # Dict mapping parameter index to optimizer state for that parameter. @@ -385,7 +294,7 @@ def test_get_optim_state_dict_unsharded_model(use_composer_model: bool): ) @pytest.mark.parametrize('use_composer_model', [True, False]) def test_get_optim_state_dict_precision_unsharded_model(precision: str, use_composer_model: bool): - model, optimizer = _init_model_and_optimizer(use_composer_model=use_composer_model, take_step=True) + model, optimizer = init_model_and_optimizer(use_composer_model=use_composer_model, take_step=True) optim_state_dict = get_optim_state_dict(model, optimizer, precision=precision) for param_state in optim_state_dict['state'].values(): assert param_state['exp_avg'].dtype == precision @@ -400,7 +309,7 @@ def test_get_optim_dict_full_for_sharded_model(world_size, tensor_type, use_comp if tensor_type == 'dtensor' and version.parse(torch.__version__) < version.parse('2.2.0'): pytest.skip('DTensor is only supported in PyTorch >= 2.2.0') - model, optimizer = _init_model_and_optimizer( + model, optimizer = init_model_and_optimizer( use_composer_model=use_composer_model, take_step=True, use_fsdp=True, @@ -427,7 +336,7 @@ def test_get_optim_dict_sharded_for_sharded_model(world_size, tensor_type, use_c if tensor_type == 'dtensor' and version.parse(torch.__version__) < version.parse('2.2.0'): pytest.skip('DTensor is only supported in PyTorch >= 2.2.0') - model, optimizer = _init_model_and_optimizer( + model, optimizer = init_model_and_optimizer( use_composer_model=use_composer_model, take_step=True, use_fsdp=True, @@ -540,7 +449,7 @@ def test_get_metadata_sharded_model(model_type: str, tensor_type: str, world_siz @pytest.mark.filterwarnings('ignore:SWA has') def test_get_resumption_state_dict(): - model, optimizer = _init_model_and_optimizer(use_composer_model=True, take_step=True, device='cpu') + model, optimizer = init_model_and_optimizer(use_composer_model=True, take_step=True, device='cpu') rank_zero_seed = 10 run_name = 'test_run' @@ -605,7 +514,7 @@ def test_get_resumption_state_dict_gpu(): else: from torch.cuda.amp.grad_scaler import GradScaler - model, _ = _init_model_and_optimizer(use_composer_model=True, take_step=False, device='cuda') + model, _ = init_model_and_optimizer(use_composer_model=True, take_step=False, device='cuda') rank_zero_seed = 10 run_name = 'test_run' diff --git a/tests/common/compare.py b/tests/common/compare.py index 432ac55dfd..79dfe573bb 100644 --- a/tests/common/compare.py +++ b/tests/common/compare.py @@ -7,6 +7,8 @@ import numpy as np import torch import torchmetrics +from torch.distributed._shard.sharded_tensor import ShardedTensor +from torch.distributed._tensor import DTensor from composer import Time from composer.core.time import TimeUnit @@ -39,7 +41,7 @@ def _check_item( assert type(item1) == type(item2) assert item1 == item2, f'{path} differs: 
{item1} != {item2}' return - if isinstance(item1, torch.Tensor): + if isinstance(item1, torch.Tensor) and not (isinstance(item1, ShardedTensor) or isinstance(item1, DTensor)): assert isinstance(item2, torch.Tensor) if item1.device != item2.device: item1 = item1.cpu() @@ -58,6 +60,16 @@ def _check_item( assert isinstance(item2, type(item1)), f'{path} differs: {item1} != {item2}' _check_list_recursively(item1, item2, path, atol=atol, rtol=rtol) return + if isinstance(item1, ShardedTensor): + assert isinstance(item2, type(item1)), f'{path} differs: {item1} != {item2}' + _check_sharded_tensor_recursively(item1, item2, path, atol=atol, rtol=rtol) + return + + if isinstance(item1, DTensor): + assert isinstance(item2, type(item1)), f'{path} differs: {item1} != {item2}' + _check_dtensor_recursively(item1, item2, path, atol=atol, rtol=rtol) + return + if isinstance(item1, torchmetrics.Metric): assert isinstance(item2, torchmetrics.Metric), f'{path} differs: {item1} != {item2}' # Increase update count so Torchmetrics doesn't throw warning when computing two metrics which haven't been updated @@ -84,6 +96,28 @@ def _check_item( raise NotImplementedError(f'Unsupported item type: {type(item1)}') +def _check_dtensor_recursively( + dtensor1: DTensor, + dtensor2: DTensor, + path: str, + atol: float, + rtol: float, +): + tensor1, tensor2 = dtensor1.to_local(), dtensor2.to_local() + _check_item(tensor1, tensor2, path, atol=atol, rtol=rtol) + + +def _check_sharded_tensor_recursively( + sharded_tensor1: ShardedTensor, + sharded_tensor2: ShardedTensor, + path: str, + atol: float, + rtol: float, +): + tensor1, tensor2 = sharded_tensor1.local_tensor(), sharded_tensor2.local_tensor() + _check_item(tensor1, tensor2, path, atol=atol, rtol=rtol) + + def _check_list_recursively( list1: Union[tuple[Any], list[Any]], list2: Union[tuple[Any], list[Any]], From 0a1a6a457a7a11b51dcd652cc93dc61a2aef246e Mon Sep 17 00:00:00 2001 From: Saaketh Narayan Date: Mon, 17 Jun 2024 20:14:17 -0700 Subject: [PATCH 21/69] Revert "Optionally use `flash-attn`'s CE loss for metrics (#3394)" (#3408) This reverts commit 2cf9262e988c7cc4ee107259b98efec0298c5017. 
revert dat boi --- .github/workflows/pr-cpu.yaml | 2 +- composer/devices/device_gpu.py | 3 - composer/metrics/nlp.py | 22 +------ tests/checkpoint/test_state_dict.py | 6 +- tests/metrics/test_nlp_metrics.py | 89 ----------------------------- 5 files changed, 4 insertions(+), 118 deletions(-) diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index 12f471749e..1bdb383823 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -22,7 +22,7 @@ jobs: markers: not daily and not remote and not gpu and not doctest pytest_command: coverage run -m pytest - name: cpu-3.11-2.3 - container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 markers: not daily and not remote and not gpu and not doctest pytest_command: coverage run -m pytest - name: cpu-doctest diff --git a/composer/devices/device_gpu.py b/composer/devices/device_gpu.py index 401368576e..19cb0a774a 100644 --- a/composer/devices/device_gpu.py +++ b/composer/devices/device_gpu.py @@ -12,7 +12,6 @@ import torch.backends.cudnn import torch.cuda import torch.cuda.amp -import torch.distributed as torch_dist import torch.utils.data from composer.devices.device import Device @@ -43,8 +42,6 @@ def __init__( ): if not torch.cuda.is_available(): raise ValueError('DeviceGPU cannot be created as torch.cuda is not available.') - if torch_dist.is_gloo_available(): - DeviceGPU.dist_backend = 'cuda:nccl,cpu:gloo' if device_id is None: device_id = dist.get_local_rank() self._device = torch.device(f'cuda:{device_id}') diff --git a/composer/metrics/nlp.py b/composer/metrics/nlp.py index c1562e5936..e6877292cf 100644 --- a/composer/metrics/nlp.py +++ b/composer/metrics/nlp.py @@ -83,21 +83,7 @@ def __init__(self, dist_sync_on_step: bool = False, ignore_index: int = -100): super().__init__(dist_sync_on_step=dist_sync_on_step) self.ignore_index = ignore_index - self.flash_loss_fn = None - try: - from flash_attn.losses.cross_entropy import CrossEntropyLoss as FusedCrossEntropyLoss - log.debug( - 'Found `flash_attn` installation. Using CrossEntropyLoss from `flash_attn`' + - 'to compute LanguageCrossEntropy metric for CUDA tensors, which will be faster.', - ) - self.flash_loss_fn = FusedCrossEntropyLoss(ignore_index=ignore_index, reduction='sum') - except ImportError: - if torch.cuda.is_available(): - log.debug( - 'Package `flash_attn` not installed. Using torch.nn.CrossEntropyLoss ' + - 'to compute LanguageCrossEntropy metric for CUDA tensors, which will be slower.', - ) - self.torch_loss_fn = torch.nn.CrossEntropyLoss(ignore_index=ignore_index, reduction='sum') + self.loss_fn = torch.nn.CrossEntropyLoss(ignore_index=ignore_index, reduction='sum') self.add_state('sum_loss', default=torch.tensor(0.), dist_reduce_fx='sum') self.add_state('total_items', default=torch.tensor(0), dist_reduce_fx='sum') @@ -118,11 +104,7 @@ def update(self, output: Union[Mapping, Tensor], target: Tensor) -> None: target = target.view(-1) logits = logits.view(target.shape[0], -1) - # Use Flash attn's CE loss function, if available, if inputs are both CUDA tensors. 
- if self.flash_loss_fn is not None and target.is_cuda and logits.is_cuda: - losses = self.flash_loss_fn(logits, target) - else: - losses = self.torch_loss_fn(logits, target) + losses = self.loss_fn(logits, target) total_items = (target != self.ignore_index).sum() self.total_items += total_items #type: ignore (third-party) diff --git a/tests/checkpoint/test_state_dict.py b/tests/checkpoint/test_state_dict.py index e010440836..4f719254a7 100644 --- a/tests/checkpoint/test_state_dict.py +++ b/tests/checkpoint/test_state_dict.py @@ -7,7 +7,6 @@ import pytest import torch -import torch.distributed as torch_dist from packaging import version from torch.distributed.fsdp import FullyShardedDataParallel as FSDP from torch.optim.lr_scheduler import StepLR @@ -440,10 +439,7 @@ def test_get_metadata_sharded_model(model_type: str, tensor_type: str, world_siz assert 'model_name' in metadata_sd assert 'dist_backend' in metadata_sd - if torch_dist.is_gloo_available(): - assert metadata_sd['dist_backend'] == 'cuda:nccl,cpu:gloo' - else: - assert metadata_sd['dist_backend'] == 'nccl' + assert metadata_sd['dist_backend'] == 'nccl' @pytest.mark.filterwarnings('ignore:SWA has') diff --git a/tests/metrics/test_nlp_metrics.py b/tests/metrics/test_nlp_metrics.py index 9b198003d3..7fe854bd96 100644 --- a/tests/metrics/test_nlp_metrics.py +++ b/tests/metrics/test_nlp_metrics.py @@ -14,7 +14,6 @@ LanguagePerplexity, MaskedAccuracy, ) -from tests.common import device @pytest.mark.parametrize('ignore_index', [-100]) @@ -51,100 +50,12 @@ def test_masked_accuracy(ignore_index, num_classes): assert abs(final_acc - (1.0 / num_classes)) < 0.02 -@device('cpu', 'gpu') @pytest.mark.parametrize('ignore_index', [-100]) @pytest.mark.parametrize('batch_size', [1e2, 1e3]) @pytest.mark.parametrize('sequence_length', [128]) @pytest.mark.parametrize('num_classes', [2, 10]) @pytest.mark.parametrize('minibatch_size', [56, 256, 768]) -@pytest.mark.parametrize('tensor_device', ['cpu', 'gpu']) def test_cross_entropy( - device: str, - batch_size: float, - ignore_index: Optional[int], - sequence_length: int, - num_classes: int, - minibatch_size: int, - tensor_device: str, -): - """Sanity check to make sure that batched CrossEntropyLoss matches the expected performance. - - Generates a predicted distribution from a normal distribution, and a ground truth from a normal distribution. - Verifies Cross Entropy Loss against the baseline performance. - - Args: - device (str): the device to run the test on - batch_size (int): how many samples are in each batch - ignore_index (Optional[int]): if present, the class index to ignore in accuracy calculations. 
- sequence_length (int): the length of the generated sequence - num_classes (int): the number of classes in the classification task - minibatch_size (int): the minibatch size to simulate for model predictions - tensor_device (str): which device the input tensors to the metric are on - """ - - if device == 'cpu' and tensor_device == 'gpu': - pytest.skip('Skipping test that would try to use GPU tensors when only CPU is available.') - - batch_size = int(batch_size) - generated_preds = torch.randn((batch_size, sequence_length, num_classes)) - generated_true = torch.randint(low=0, high=num_classes, size=(batch_size, sequence_length)) - - assert ignore_index is not None - torchmetrics_xent = LanguageCrossEntropy(dist_sync_on_step=False, ignore_index=ignore_index) - ce_with_keys_metric = LanguageCrossEntropy(dist_sync_on_step=False, ignore_index=ignore_index) - - if tensor_device == 'cpu': - torchmetrics_xent = torchmetrics_xent.to('cpu') - ce_with_keys_metric = ce_with_keys_metric.to('cpu') - elif tensor_device == 'gpu': - torchmetrics_xent = torchmetrics_xent.to('cuda') - ce_with_keys_metric = ce_with_keys_metric.to('cuda') - - if device == 'gpu': - assert torchmetrics_xent.flash_loss_fn is not None - - labels_mask = torch.rand((batch_size, sequence_length)) - labels_mask[labels_mask > 0.8] = 1 - labels_mask[labels_mask <= 0.8] = 0 - labels_mask = labels_mask.bool() - generated_true[labels_mask] = ignore_index - - num_batches = math.ceil(batch_size / minibatch_size) - for batch_idx in range(num_batches): - begin_idx = (batch_idx * minibatch_size) - end_idx = ((batch_idx + 1) * minibatch_size) - preds_subset = generated_preds[begin_idx:end_idx] - true_subset = generated_true[begin_idx:end_idx] - - if tensor_device == 'cpu': - preds_subset = preds_subset.cpu() - true_subset = true_subset.cpu() - elif tensor_device == 'gpu': - preds_subset = preds_subset.cuda() - true_subset = true_subset.cuda() - - torchmetrics_xent.update(preds_subset, true_subset) - ce_with_keys_metric.update( - { - 'logits': preds_subset.view(-1, num_classes), - 'loss': cross_entropy(preds_subset.view(-1, num_classes), true_subset.view(-1)), - }, - true_subset.view(-1), - ) - - torchmetrics_loss = torchmetrics_xent.compute() - ce_with_keys_loss = ce_with_keys_metric.compute() - correct_loss = cross_entropy(generated_preds.view(-1, num_classes), generated_true.view(-1)) - assert torchmetrics_loss == ce_with_keys_loss - assert torch.isclose(correct_loss, torchmetrics_loss) - - -@pytest.mark.parametrize('ignore_index', [-100]) -@pytest.mark.parametrize('batch_size', [1e2, 1e3]) -@pytest.mark.parametrize('sequence_length', [128]) -@pytest.mark.parametrize('num_classes', [2, 10]) -@pytest.mark.parametrize('minibatch_size', [56, 256, 768]) -def test_torch_cpu_cross_entropy( batch_size: float, ignore_index: Optional[int], sequence_length: int, From 0d6ef2623f278685b7bff0831e1d46c95dbfb8c4 Mon Sep 17 00:00:00 2001 From: Saaketh Narayan Date: Mon, 17 Jun 2024 20:52:41 -0700 Subject: [PATCH 22/69] CPU tests image fix (#3409) * Revert "Optionally use `flash-attn`'s CE loss for metrics (#3394)" This reverts commit 2cf9262e988c7cc4ee107259b98efec0298c5017. 
revert dat boi * remove * slamm --- .github/workflows/pr-cpu.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index 1bdb383823..12f471749e 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -22,7 +22,7 @@ jobs: markers: not daily and not remote and not gpu and not doctest pytest_command: coverage run -m pytest - name: cpu-3.11-2.3 - container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04 markers: not daily and not remote and not gpu and not doctest pytest_command: coverage run -m pytest - name: cpu-doctest From dac19958fd2bfdc48149bd4d10be1a90d2c15fb4 Mon Sep 17 00:00:00 2001 From: Brian <23239305+b-chu@users.noreply.github.com> Date: Tue, 18 Jun 2024 13:29:34 -0400 Subject: [PATCH 23/69] Add setter for epoch in iteration (#3407) --- composer/core/time.py | 25 +++++++++++++++++-------- tests/test_time.py | 7 +++++++ 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/composer/core/time.py b/composer/core/time.py index 3916dd7659..00af1fd456 100644 --- a/composer/core/time.py +++ b/composer/core/time.py @@ -525,13 +525,8 @@ def __init__( raise ValueError(f'The `token` argument has units of {token.unit}; not {TimeUnit.TOKEN}.') self._token = token - epoch_in_iteration = Time.from_input(epoch_in_iteration, TimeUnit.EPOCH) - if epoch_in_iteration.unit != TimeUnit.EPOCH: - raise ValueError(( - f'The `epoch_in_iteration` argument has units of {epoch_in_iteration.unit}; ' - f'not {TimeUnit.EPOCH}.' - )) - self._epoch_in_iteration = epoch_in_iteration + self._epoch_in_iteration = Time(0, TimeUnit.EPOCH) + self.epoch_in_iteration = epoch_in_iteration token_in_iteration = Time.from_input(token_in_iteration, TimeUnit.TOKEN) if token_in_iteration.unit != TimeUnit.TOKEN: @@ -619,7 +614,7 @@ def load_state_dict(self, state: dict[str, Any]) -> None: if 'iteration' in state: self._iteration = Time(state['iteration'], TimeUnit.ITERATION) if 'epoch_in_iteration' in state: - self._epoch_in_iteration = Time(state['epoch_in_iteration'], TimeUnit.EPOCH) + self.epoch_in_iteration = Time(state['epoch_in_iteration'], TimeUnit.EPOCH) if 'token_in_iteration' in state: self._token_in_iteration = Time(state['token_in_iteration'], TimeUnit.TOKEN) if 'iteration_wct' in state: @@ -655,6 +650,20 @@ def epoch_in_iteration(self) -> Time[int]: """The epoch count in the current iteration (resets at 0 at the beginning of every iteration).""" return self._epoch_in_iteration + @epoch_in_iteration.setter + def epoch_in_iteration( + self, + epoch_in_iteration: Union[int, Time[int]], # pyright: ignore[reportPropertyTypeMismatch] + ): + """Sets epoch count in the current iteration.""" + epoch_in_iteration = Time.from_input(epoch_in_iteration, TimeUnit.EPOCH) + if epoch_in_iteration.unit != TimeUnit.EPOCH: + raise ValueError(( + f'The `epoch_in_iteration` argument has units of {epoch_in_iteration.unit}; ' + f'not {TimeUnit.EPOCH}.' 
+ )) + self._epoch_in_iteration = epoch_in_iteration + @property def token_in_iteration(self) -> Time[int]: """The token count in the current iteration (resets at 0 at the beginning of every iteration).""" diff --git a/tests/test_time.py b/tests/test_time.py index 1545eaa3b1..d585d9af36 100644 --- a/tests/test_time.py +++ b/tests/test_time.py @@ -146,6 +146,13 @@ def test_timestamp_update(): assert timestamp is not timestamp_2 +def test_set_timestamp(): + timestamp = Timestamp(epoch_in_iteration=1) + assert timestamp.epoch_in_iteration == 1 + timestamp.epoch_in_iteration = 2 + assert timestamp.epoch_in_iteration == 2 + + def test_timestamp_to_next_batch_epoch_iteration(): timestamp = Timestamp() # Step batch 0 in epoch 0 From 567c6e5065aef2b7a89cd7f7334b489ebf5c0c34 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 18 Jun 2024 13:02:34 -0700 Subject: [PATCH 24/69] Move pillow dep as required (#3412) * move pil dep * remove pillow simd --- docker/Dockerfile | 21 --------------------- setup.py | 16 +--------------- 2 files changed, 1 insertion(+), 36 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index e547b44c7b..970af2f1ef 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -25,13 +25,6 @@ ARG PYTORCH_VERSION=1.13.1 # version that corresponds to the PyTorch version ARG TORCHVISION_VERSION=0.14.1 -# In the Dockerimage, Pillow-SIMD is installed instead of Pillow. To trick pip into thinking that -# Pillow is also installed (so it won't override it with a future pip install), a Pillow stub is included -# PILLOW_PSEUDOVERSION is the Pillow version that pip thinks is installed -# PILLOW_SIMD_VERSION is the actual version of pillow-simd that is installed. -ARG PILLOW_PSEUDOVERSION=9.3.0 -ARG PILLOW_SIMD_VERSION=9.0.0.post1 - # Version of the Mellanox Drivers to install (for InfiniBand support) # Leave blank for no Mellanox Drivers ARG MOFED_VERSION=5.5-1.0.3.2 @@ -181,20 +174,6 @@ RUN add-apt-repository ppa:deadsnakes/ppa && \ RUN curl -fsSL https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} - && \ pip${PYTHON_VERSION} install --no-cache-dir --upgrade 'pip<23' 'setuptools<70.0.0' -##################### -# Install pillow-simd -##################### -ARG PILLOW_PSEUDOVERSION -ARG PILLOW_SIMD_VERSION - -# pillow_stub tricks pip into thinking that it installed pillow, -# so when pillow_simd is installed, other packages won't later override it -COPY pillow_stub /tmp/pillow_stub - -RUN pip${PYTHON_VERSION} install --no-cache-dir --upgrade /tmp/pillow_stub && \ - pip${PYTHON_VERSION} install --no-cache-dir --upgrade pillow_simd==${PILLOW_SIMD_VERSION} && \ - rm -rf /tmp/pillow_stub - ################# # Install Pytorch ################# diff --git a/setup.py b/setup.py index cbffa0b79c..207fe841c9 100644 --- a/setup.py +++ b/setup.py @@ -91,6 +91,7 @@ def package_files(prefix: str, directory: str, extension: str): 'packaging>=21.3.0,<24.2', 'importlib-metadata>=5.0.0,<7', 'mosaicml-cli>=0.5.25,<0.7', + 'pillow>=10.3.0,<11', ] extra_deps = {} @@ -142,7 +143,6 @@ def package_files(prefix: str, directory: str, extension: str): 'cryptography==42.0.8', 'pytest-httpserver>=1.0.4,<1.1', 'setuptools<=59.5.0', - 'pillow>=10.3.0,<11', ] extra_deps['system_metrics_monitor'] = { @@ -280,17 +280,3 @@ def package_files(prefix: str, directory: str, extension: str): ext_package='composer', cmdclass={'develop': develop}, ) - -# only visible if user installs with verbose -v flag -# Printing to stdout as not to interfere with setup.py CLI flags (e.g. 
--version) -print('*' * 20, file=sys.stderr) -print( - textwrap.dedent( - """\ - NOTE: For best performance, we recommend installing Pillow-SIMD - for accelerated image processing operations. To install: - \t pip uninstall pillow && pip install pillow-simd""", - ), - file=sys.stderr, -) -print('*' * 20, file=sys.stderr) From f26a1d32b67947f06fbcedfddddf9d46dbaf2d78 Mon Sep 17 00:00:00 2001 From: Jack Zhang <170473087+JackZ-db@users.noreply.github.com> Date: Tue, 18 Jun 2024 15:44:35 -0700 Subject: [PATCH 25/69] fixing mlflow logging to Databricks workspace file paths with /Shared/ prefix (#3410) * fixing os file path with /Shared/ prefix * lstrip '/' from experiment name if not '/Shared/' or '/Users/' Co-authored-by: Mihir Patel * doesnt modify experiment name if it has '/Shared/' as a prefix * fix formatting * lint --------- Co-authored-by: Mihir Patel --- composer/loggers/mlflow_logger.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/composer/loggers/mlflow_logger.py b/composer/loggers/mlflow_logger.py index 03070c28f9..aed32eea39 100644 --- a/composer/loggers/mlflow_logger.py +++ b/composer/loggers/mlflow_logger.py @@ -150,7 +150,12 @@ def __init__( ) assert self.experiment_name is not None # type hint - if os.getenv('DATABRICKS_TOKEN') is not None and not self.experiment_name.startswith('/Users/'): + if os.getenv( + 'DATABRICKS_TOKEN', + ) is not None and not self.experiment_name.startswith(( + '/Users/', + '/Shared/', + )): try: from databricks.sdk import WorkspaceClient except ImportError as e: @@ -160,7 +165,7 @@ def __init__( conda_channel='conda-forge', ) from e databricks_username = WorkspaceClient().current_user.me().user_name or '' - self.experiment_name = '/' + os.path.join('Users', databricks_username, self.experiment_name) + self.experiment_name = os.path.join('/Users', databricks_username, self.experiment_name.strip('/')) self._mlflow_client = MlflowClient(self.tracking_uri) # Set experiment From 894a1923393a36900ebab1b45b04ae41f07a5a39 Mon Sep 17 00:00:00 2001 From: Karan Jariwala Date: Tue, 18 Jun 2024 21:52:26 -0700 Subject: [PATCH 26/69] Bump version v0.23.3 (#3414) * Bump version v0.23.3 * update the composer version --- composer/_version.py | 2 +- docker/README.md | 4 ++-- docker/build_matrix.yaml | 12 ++++++------ docker/generate_build_matrix.py | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/composer/_version.py b/composer/_version.py index a38b61a722..1e088461f8 100644 --- a/composer/_version.py +++ b/composer/_version.py @@ -3,4 +3,4 @@ """The Composer Version.""" -__version__ = '0.24.0.dev0' +__version__ = '0.23.3' diff --git a/docker/README.md b/docker/README.md index 05c97fe626..e10af0a194 100644 --- a/docker/README.md +++ b/docker/README.md @@ -15,8 +15,8 @@ all dependencies for both NLP and Vision models. 
They are built on top of the | Composer Version | CUDA Support | Docker Tag | |--------------------|----------------|----------------------------------------------------------------| -| 0.23.2 | Yes | `mosaicml/composer:latest`, `mosaicml/composer:0.23.2` | -| 0.23.2 | No | `mosaicml/composer:latest_cpu`, `mosaicml/composer:0.23.2_cpu` | +| 0.23.3 | Yes | `mosaicml/composer:latest`, `mosaicml/composer:0.23.3` | +| 0.23.3 | No | `mosaicml/composer:latest_cpu`, `mosaicml/composer:0.23.3_cpu` | **Note**: For a lightweight installation, we recommended using a [MosaicML PyTorch Image](#pytorch-images) and manually diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml index 73074988b9..faa21b8e89 100644 --- a/docker/build_matrix.yaml +++ b/docker/build_matrix.yaml @@ -208,9 +208,9 @@ TORCHVISION_VERSION: 0.16.2 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04 - COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.2 + COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.3 CUDA_VERSION: 12.1.1 - IMAGE_NAME: composer-0-23-2 + IMAGE_NAME: composer-0-23-3 MOFED_VERSION: latest-23.10 NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 @@ -231,15 +231,15 @@ PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.3.1 TAGS: - - mosaicml/composer:0.23.2 + - mosaicml/composer:0.23.3 - mosaicml/composer:latest TARGET: composer_stage TORCHVISION_VERSION: 0.18.1 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: ubuntu:20.04 - COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.2 + COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.3 CUDA_VERSION: '' - IMAGE_NAME: composer-0-23-2-cpu + IMAGE_NAME: composer-0-23-3-cpu MOFED_VERSION: latest-23.10 NVIDIA_REQUIRE_CUDA_OVERRIDE: '' PYTHON_VERSION: '3.11' @@ -247,7 +247,7 @@ PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.3.1 TAGS: - - mosaicml/composer:0.23.2_cpu + - mosaicml/composer:0.23.3_cpu - mosaicml/composer:latest_cpu TARGET: composer_stage TORCHVISION_VERSION: 0.18.1 diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index bf961a756c..9a634b0d36 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -231,7 +231,7 @@ def _main(): composer_entries = [] # The `GIT_COMMIT` is a placeholder and Jenkins will substitute it with the actual git commit for the `composer_staging` images - composer_versions = ['0.23.2'] # Only build images for the latest composer version + composer_versions = ['0.23.3'] # Only build images for the latest composer version composer_python_versions = [PRODUCTION_PYTHON_VERSION] # just build composer against the latest for product in itertools.product(composer_python_versions, composer_versions, cuda_options): From 459a0197ceece6df1d4cbf1de9576b23205cfdca Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 20 Jun 2024 10:20:00 -0700 Subject: [PATCH 27/69] Update numpy requirement from <1.27.0,>=1.21.5 to >=1.21.5,<2.1.0 (#3406) * Update numpy requirement from <1.27.0,>=1.21.5 to >=1.21.5,<2.1.0 Updates the requirements on [numpy](https://github.com/numpy/numpy) to permit the latest version. 
- [Release notes](https://github.com/numpy/numpy/releases) - [Changelog](https://github.com/numpy/numpy/blob/main/doc/RELEASE_WALKTHROUGH.rst) - [Commits](https://github.com/numpy/numpy/compare/v1.21.5...v2.0.0) --- updated-dependencies: - dependency-name: numpy dependency-type: direct:production ... Signed-off-by: dependabot[bot] * commit * fix typing --------- Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Mihir Patel Co-authored-by: Saaketh Narayan --- composer/algorithms/augmix/augmix.py | 4 ++-- setup.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/composer/algorithms/augmix/augmix.py b/composer/algorithms/augmix/augmix.py index f19ae36243..412fd737d8 100644 --- a/composer/algorithms/augmix/augmix.py +++ b/composer/algorithms/augmix/augmix.py @@ -96,8 +96,8 @@ def _augmix_pil_image( aug = np.random.choice(augmentation_set) augmented_image = aug(augmented_image, severity) augmented_combination += chain_weights[chain_i] * np.asarray(augmented_image) - mixed = (1 - mixing_weight) * np.asarray(img_pil) + mixing_weight * augmented_combination - mixed = Image.fromarray(np.uint8(mixed)) + mixed = (1 - mixing_weight) * np.asarray(img_pil, dtype=np.float32) + mixing_weight * augmented_combination + mixed = Image.fromarray(np.uint8(mixed)) # type: ignore return mixed f_pil = functools.partial( diff --git a/setup.py b/setup.py index 207fe841c9..29f7a8466b 100644 --- a/setup.py +++ b/setup.py @@ -83,7 +83,7 @@ def package_files(prefix: str, directory: str, extension: str): 'torchvision>=0.13.1,<0.18.2', 'torch>=2.1.2,<2.3.2', 'requests>=2.26.0,<3', - 'numpy>=1.21.5,<1.27.0', + 'numpy>=1.21.5,<2.1.0', 'psutil>=5.8.0,<6', 'coolname>=1.1.0,<3', 'tabulate==0.9.0', # for auto-generating tables From 7a4644acad747f68c430ab6ed56d9aa66cde6555 Mon Sep 17 00:00:00 2001 From: Karan Jariwala Date: Thu, 20 Jun 2024 10:23:19 -0700 Subject: [PATCH 28/69] Restore dev version (#3417) --- composer/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/_version.py b/composer/_version.py index 1e088461f8..a38b61a722 100644 --- a/composer/_version.py +++ b/composer/_version.py @@ -3,4 +3,4 @@ """The Composer Version.""" -__version__ = '0.23.3' +__version__ = '0.24.0.dev0' From 94f1ec16b5ffd665bac0271034cefe5545cf4e2d Mon Sep 17 00:00:00 2001 From: Evan Racah Date: Thu, 20 Jun 2024 17:54:21 -0700 Subject: [PATCH 29/69] Save checkpoint to disk for API with new save layout (#3399) --- composer/callbacks/checkpoint_saver.py | 3 +- composer/checkpoint/save.py | 284 ++++++++++++++++++++++++- composer/checkpoint/state_dict.py | 2 +- composer/utils/checkpoint.py | 1 + tests/checkpoint/helpers.py | 71 ++++++- tests/checkpoint/test_save.py | 151 ++++++++++++- tests/checkpoint/test_state_dict.py | 56 +---- 7 files changed, 507 insertions(+), 61 deletions(-) diff --git a/composer/callbacks/checkpoint_saver.py b/composer/callbacks/checkpoint_saver.py index 263558fc2b..c17b874c21 100644 --- a/composer/callbacks/checkpoint_saver.py +++ b/composer/callbacks/checkpoint_saver.py @@ -30,6 +30,7 @@ is_model_deepspeed, partial_format, ) +from composer.utils.checkpoint import _TORCH_DISTRIBUTED_CHECKPOINTS_METADATA_FILENAME from composer.utils.compression import get_compressor, is_compressed_pt from composer.utils.object_store.mlflow_object_store import MLFLOW_EXPERIMENT_ID_FORMAT_KEY, MLFLOW_RUN_ID_FORMAT_KEY @@ -37,8 +38,6 @@ __all__ = ['CheckpointSaver'] 
-_TORCH_DISTRIBUTED_CHECKPOINTS_METADATA_FILENAME = '.metadata' - class CheckpointSaver(Callback): # noqa: D101 __doc__ = f"""Callback to save checkpoints. diff --git a/composer/checkpoint/save.py b/composer/checkpoint/save.py index 72e5311d0f..03166d8802 100644 --- a/composer/checkpoint/save.py +++ b/composer/checkpoint/save.py @@ -3,12 +3,15 @@ """Useful functions for saving state dicts to disk.""" +import json import logging import os +import pickle import textwrap import warnings +from dataclasses import dataclass from pathlib import Path -from typing import Any, Dict, Optional, Union +from typing import Any, Dict, Optional, Sequence, Union import torch import torch.distributed.checkpoint as DCP @@ -16,6 +19,275 @@ from torch.distributed._shard.sharded_tensor import ShardedTensor from torch.distributed._tensor import DTensor +from composer.checkpoint.state_dict import ( + get_metadata_state_dict, + get_model_state_dict, + get_optim_state_dict, + get_resumption_state_dict, +) +from composer.core import State, Time +from composer.devices import Device +from composer.models import ComposerModel +from composer.utils import dist +from composer.utils.checkpoint import _TORCH_DISTRIBUTED_CHECKPOINTS_FILENAME, _write_checkpoint_file +from composer.utils.file_helpers import format_name_with_dist_and_time + +log = logging.getLogger(__name__) + +MODEL_CHECKPOINT_DIRECTORY_NAME = 'model' +MONOLITHIC_MODEL_CHECKPOINT_FILENAME = 'model.pt' +OPTIM_CHECKPOINT_DIRECTORY_NAME = 'optim' +OPTIM_MONO_CHECKPOINT_FILENAME = 'optim.pt' +METADATA_CHECKPOINT_FILENAME = 'composer_metadata.json' +RESUMPTION_CHECKPOINT_FILENAME = 'resumption.pkl' + + +@dataclass +class CheckpointSaveOptions: + """Options for saving a checkpoint to disk. + + Args: + destination_dir (str): The directory to save the checkpoint to. + save_frequency (Union[str, int, Time]): The frequency to save the checkpoint. + If '1ep', the checkpoint will be saved after each epoch. + If '1ba', the checkpoint will be saved after each batch. + If an int, the checkpoint will be saved after that many epochs. + dir_prefix (str): The prefix to use for the directory name. Can include {epoch} and {batch}. + overwrite (bool): Whether to overwrite the checkpoint if it already exists. + save_model (bool): Whether to save the model. + save_optimizer (bool): Whether to save the optimizer. + save_resumption_state (bool): Whether to save the resumption state. + num_checkpoints_to_keep (int): The number of checkpoints to keep. + If -1, all checkpoints will be kept. + save_format (str): The format to save the model in. 'pt', which is the standard pytorch serializarion, is the only option for now. + sharded_checkpoint (bool): Whether to save the model as a sharded checkpoint. + precision (str): The precision to save the model in. One of 'bf16', 'fp32', 'fp16', 'fp64'. + include_keys (Optional[Union[str, Sequence[str]]]): Keys to include in the saved model. + ignore_keys (Optional[Union[str, Sequence[str]]]): Keys to ignore in the saved model. 
+ """ + destination_dir: str + save_frequency: Union[str, int, Time] = '1ep' + dir_prefix: str = 'ep{epoch}-ba{batch}' + overwrite: bool = False + save_model: bool = True + save_optimizer: bool = True + save_resumption_state: bool = True + num_checkpoints_to_keep: int = -1 + save_format: str = 'pt' + sharded_checkpoint: bool = False + precision: str = 'bf16' + include_keys: Optional[Union[str, Sequence[str]]] = None + ignore_keys: Optional[Union[str, Sequence[str]]] = None + + +def save_checkpoint_to_disk( + state: State, + options: Optional[Union[CheckpointSaveOptions, Dict]] = None, + destination_dir: Optional[str] = None, +): + """Saves a checkpoint to disk. + + Args: + state (State): The state to save. + options (Optional[Union[CheckpointSaveOptions, Dict]]): The options for saving the checkpoint. + If None, destination_dir must be provided. + destination_dir (Optional[str]): The directory to save the checkpoint to. + If options is provided, this will overwrite options.destination_dir. + """ + if options is None: + if destination_dir is None: + raise ValueError('destination_dir must be provided if options is None') + options = CheckpointSaveOptions(destination_dir=destination_dir) + else: + if isinstance(options, Dict): + options = CheckpointSaveOptions(**options) + if destination_dir is not None: + options.destination_dir = destination_dir + save_path = os.path.join(options.destination_dir, options.dir_prefix) + save_path = format_name_with_dist_and_time(save_path, state.run_name, state.timestamp) + os.makedirs(save_path, exist_ok=True) + if options.save_model: + save_model_to_disk( + state.model, + save_path, + options.sharded_checkpoint, + options.precision, + options.include_keys, + options.ignore_keys, + options.overwrite, + options.save_format, + ) + if options.save_optimizer: + optimizer = state.optimizers[0] + save_optim_to_disk( + state.model, + optimizer, + save_path, + options.sharded_checkpoint, + options.precision, + options.overwrite, + options.save_format, + ) + if options.save_resumption_state: + save_resumption_state_to_disk(state, save_path) + + save_composer_metadata_to_disk( + save_path, + state.model, + options.sharded_checkpoint, + options.precision, + state.device, + state.device_train_microbatch_size, + ) + + +def save_model_to_disk( + model: Union[ComposerModel, torch.nn.Module], + destination_dir: str, + sharded_checkpoint: bool = False, + precision: str = 'fp32', + include_keys: Optional[Union[str, Sequence[str]]] = None, + ignore_keys: Optional[Union[str, Sequence[str]]] = None, + overwrite: bool = False, + save_format: str = 'pt', # or hf, safetensor +) -> Optional[str]: + """Saves a model to disk. + + Args: + model (Union[ComposerModel, torch.nn.Module]): The model to save. + destination_dir (str): The directory to save the model to. + Model will be saved as distination_dir/models/model.pt if sharded_checkpoint is False, + otherwise all shards will be saved as destination_dir/models/___0.distcp. + sharded_checkpoint (bool): Whether to save the model as a sharded checkpoint. + precision (str): The precision to save the model in. One of 'bf16', 'fp32', 'fp16', 'fp64'. + include_keys (Optional[Union[str, Sequence[str]]]): Keys to include in the saved model. + ignore_keys (Optional[Union[str, Sequence[str]]]): Keys to ignore in the saved model. + overwrite (bool): If True, the file will be overwritten if it exists. + save_format (str): The format to save the model in. One of 'pt', 'hf', or 'safetensor'. + + Returns: + str: The full path to the saved model. 
+ """ + if save_format != 'pt': + raise NotImplementedError( + f"Saving checkpoint in format {save_format} is not supported. Please choose from ['pt'].", + ) + model_state_dict = get_model_state_dict( + model, + sharded_checkpoint, + precision, + include_keys, + ignore_keys, + ) + + destination_file_path = ( + os.path.join(destination_dir, MODEL_CHECKPOINT_DIRECTORY_NAME) if sharded_checkpoint else + os.path.join(destination_dir, MODEL_CHECKPOINT_DIRECTORY_NAME, MONOLITHIC_MODEL_CHECKPOINT_FILENAME) + ) + saved_path = save_state_dict_to_disk( + state_dict=model_state_dict, + destination_file_path=destination_file_path, + overwrite=overwrite, + save_format=save_format, + ) + return saved_path + + +def save_optim_to_disk( + model: Union[ComposerModel, torch.nn.Module], + optimizer: torch.optim.Optimizer, + destination_dir: str, + sharded_checkpoint: bool = False, + precision: str = 'fp32', + overwrite: bool = False, + save_format: str = 'pt', +) -> Optional[str]: + """Saves an optimizer to disk. + + Args: + model (Union[ComposerModel, torch.nn.Module]): The model to save. + optimizer (torch.optim.Optimizer): The optimizer to save. + destination_dir (str): The directory to save the optimizer to. + Optimizer will be saved as destination_dir/optim/optim.pt if sharded_checkpoint is False, + otherwise all shards will be saved as destination_dir/optim/___0.distcp. + sharded_checkpoint (bool): Whether to save the optimizer as a sharded checkpoint. + precision (str): The precision to save the optimizer in. One of 'bf16', 'fp32', 'fp16', 'fp64'. + overwrite (bool): If True, the file will be overwritten if it exists. + save_format (str): The format to save the optimizer in. One of 'pt'. + """ + optim_state_dict = get_optim_state_dict( + model, + optimizer, + sharded_state_dict=sharded_checkpoint, + precision=precision, + ) + destination_file_path = os.path.join(destination_dir, + OPTIM_CHECKPOINT_DIRECTORY_NAME) if sharded_checkpoint else os.path.join( + destination_dir, + OPTIM_CHECKPOINT_DIRECTORY_NAME, + OPTIM_MONO_CHECKPOINT_FILENAME, + ) + saved_path = save_state_dict_to_disk( + state_dict=optim_state_dict, + destination_file_path=destination_file_path, + overwrite=overwrite, + save_format=save_format, + ) + + return saved_path + + +def save_composer_metadata_to_disk( + destination_dir: str, + model: Optional[Union[ComposerModel, torch.nn.Module]] = None, + sharded_state_dict: Optional[bool] = None, + precision: Optional[Union[str, torch.dtype]] = None, + device: Optional[Device] = None, + device_train_microbatch_size: Optional[Union[int, float]] = None, +): + """Saves metadata about the model to disk. + + Args: + destination_dir (str): The directory to save the metadata to. + model (Optional[Union[ComposerModel, torch.nn.Module]]): The model to save metadata about. + sharded_state_dict (Optional[bool]): Whether the model is sharded. + precision (Optional[Union[str, torch.dtype]]): The precision of the model. + device (Optional[Device]): The device the model is on. + device_train_microbatch_size (Optional[Union[int, float]]): The device train microbatch size. 
+ """ + md_dict = get_metadata_state_dict( + model, + sharded_state_dict, + precision, + device, + device_train_microbatch_size, + ) + os.makedirs(destination_dir, exist_ok=True) + destination_file_path = os.path.join(destination_dir, METADATA_CHECKPOINT_FILENAME) + + if dist.get_global_rank() == 0: + with open(destination_file_path, 'w') as f: + json.dump(md_dict, f, indent=4) + return destination_file_path + + +def save_resumption_state_to_disk( + state: State, + destination_dir: str, +): + """Saves the resumption state to disk. + + Args: + state (State): The state to save. + destination_dir (str): The directory to save the resumption state to. + """ + resumption_state_dict = get_resumption_state_dict(state) + destination_file_path = os.path.join(destination_dir, RESUMPTION_CHECKPOINT_FILENAME) + with open(destination_file_path, 'wb') as f: + pickle.dump(resumption_state_dict, f) + return destination_file_path + + from composer.utils import dist from composer.utils.checkpoint import _TORCH_DISTRIBUTED_CHECKPOINTS_FILENAME, _write_checkpoint_file @@ -80,6 +352,8 @@ def _save_sharded_state_dict_to_disk( ) destination_file_path = stripped_path + # Wait for all ranks to get here before checking if the directory exists. + dist.barrier() if dist.get_global_rank() == 0 and not overwrite and os.path.exists(destination_file_path): raise ValueError(f'Directory {destination_file_path} already exists. Set overwrite=True to overwrite it.') @@ -94,6 +368,9 @@ def _save_sharded_state_dict_to_disk( else: DCP.save(state_dict=state_dict, storage_writer=DCP.FileSystemWriter(destination_file_path)) + log.debug( + f'Finished saving of sharded state dict to {destination_file_path}/{_TORCH_DISTRIBUTED_CHECKPOINTS_FILENAME}', + ) return destination_file_path + '/' + _TORCH_DISTRIBUTED_CHECKPOINTS_FILENAME @@ -106,13 +383,14 @@ def _save_full_state_dict_to_disk( if save_format != 'pt': raise NotImplementedError( - f"Saving sharded state dict to disk in format {save_format} is not supported. Please choose from ['pt'].", + f"Saving full state dict to disk in format {save_format} is not supported. Please choose from ['pt'].", ) if not overwrite and os.path.exists(destination_file_path): raise ValueError(f'File {destination_file_path} already exists. Set overwrite=True to overwrite it.') if dist.get_global_rank() == 0: + os.makedirs(os.path.dirname(destination_file_path), exist_ok=True) _write_checkpoint_file(state_dict=state_dict, filename=destination_file_path) return destination_file_path return None @@ -130,7 +408,7 @@ def is_state_dict_sharded(state_dict: Dict[str, Any]) -> bool: for value in state_dict.values(): if isinstance(value, ShardedTensor) or isinstance(value, DTensor): return True - if isinstance(value, Dict): + elif isinstance(value, Dict): is_sharded = is_state_dict_sharded(value) if is_sharded: return True diff --git a/composer/checkpoint/state_dict.py b/composer/checkpoint/state_dict.py index a20baaf165..5f82836d7b 100644 --- a/composer/checkpoint/state_dict.py +++ b/composer/checkpoint/state_dict.py @@ -380,7 +380,7 @@ def get_metadata_state_dict( sharded_state_dict: Optional[bool] = None, precision: Optional[Union[str, torch.dtype]] = None, device: Optional[Device] = None, - device_train_microbatch_size: Optional[int] = None, + device_train_microbatch_size: Optional[Union[int, float]] = None, ) -> dict[str, Any]: """Generate the metadata and integrations for a training run. 
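The patch above introduces a standalone checkpoint-save API (save_checkpoint_to_disk plus CheckpointSaveOptions). A minimal, non-sharded usage sketch follows; it relies only on the functions added in this patch, while the tiny classifier, the random dataset, and the /tmp destination path are hypothetical placeholders rather than anything defined here. Resumption-state saving is left disabled because the toy dataset defines no state_dict() worth capturing; a sharded save would additionally require an FSDP-wrapped model in a multi-process run.

# Illustrative sketch only; assumes the composer.checkpoint.save module added in this patch.
import torch
from torch.utils.data import DataLoader, TensorDataset

from composer import Trainer
from composer.checkpoint.save import save_checkpoint_to_disk
from composer.models import ComposerClassifier

# Placeholder model and data (not part of the patch); any ComposerModel works.
module = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU(), torch.nn.Linear(8, 3))
model = ComposerClassifier(module=module, num_classes=3)
dataset = TensorDataset(torch.randn(32, 8), torch.randint(0, 3, (32,)))
train_dataloader = DataLoader(dataset, batch_size=4)

trainer = Trainer(model=model, train_dataloader=train_dataloader, max_duration='1ep')
trainer.fit()

# Non-sharded save: per the patch this writes ep{epoch}-ba{batch}/model/model.pt,
# ep{epoch}-ba{batch}/optim/optim.pt, and composer_metadata.json under destination_dir.
save_checkpoint_to_disk(
    trainer.state,
    {
        'destination_dir': '/tmp/composer-ckpt',  # hypothetical path
        'dir_prefix': 'ep{epoch}-ba{batch}',
        'sharded_checkpoint': False,
        'save_model': True,
        'save_optimizer': True,
        'save_resumption_state': False,
        'precision': 'fp32',
    },
)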
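Callers that only need the lower-level helper can drive save_state_dict_to_disk directly with a state dict from get_model_state_dict, which is what save_model_to_disk does internally. The sketch below assumes a single-process run, so is_state_dict_sharded() returns False and the full (single-file) path is taken; the module and file path are again illustrative placeholders.

# Illustrative sketch only; single-process, unsharded state dict.
import torch

from composer.checkpoint.save import save_state_dict_to_disk
from composer.checkpoint.state_dict import get_model_state_dict

module = torch.nn.Linear(8, 3)  # placeholder module, not from the patch
state_dict = get_model_state_dict(module, sharded_state_dict=False)

# With an unsharded state dict this takes the single-file path: parent directories
# are created and a regular .pt file is written on global rank 0 per the patched helper.
saved_path = save_state_dict_to_disk(
    state_dict=state_dict,
    destination_file_path='/tmp/composer-ckpt/model-only/model.pt',  # hypothetical path
    overwrite=True,
    save_format='pt',
)
# Returns the file path on global rank 0 and None on other ranks.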
diff --git a/composer/utils/checkpoint.py b/composer/utils/checkpoint.py index f2342eeb4c..f9ad516724 100644 --- a/composer/utils/checkpoint.py +++ b/composer/utils/checkpoint.py @@ -53,6 +53,7 @@ _COMPOSER_STATES_FILENAME = 'composer_states.pt' _DEEPSPEED_TAG = 'deepspeed' # always tag with the same, deterministic name. We'll rename the tarball to the appropriate name. _TORCH_DISTRIBUTED_CHECKPOINTS_FILENAME = f'__{dist.get_global_rank()}_0.distcp' +_TORCH_DISTRIBUTED_CHECKPOINTS_METADATA_FILENAME = '.metadata' def _get_checkpoint_validation_function( diff --git a/tests/checkpoint/helpers.py b/tests/checkpoint/helpers.py index 047d30e813..4915c3a150 100644 --- a/tests/checkpoint/helpers.py +++ b/tests/checkpoint/helpers.py @@ -1,24 +1,85 @@ # Copyright 2024 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Dict +from typing import Any, Dict, Tuple, Union +from unittest.mock import MagicMock import torch +from packaging import version from torch.distributed.fsdp import FullyShardedDataParallel as FSDP from torch.distributed.fsdp.api import CPUOffload from torch.optim import adam - +from torch.optim.lr_scheduler import StepLR +from torch.utils.data import DataLoader + +from composer.algorithms import SWA +from composer.callbacks import SpeedMonitor +from composer.core import State +from composer.devices import Device, DeviceCPU, DeviceGPU +from composer.models import ComposerModel from tests.common.models import EvenSimplerMLP, SimpleComposerMLP __all__ = [ 'init_model_and_optimizer', 'init_model', 'init_optimizer', + 'init_state', ] +def init_state( + use_fsdp: bool = False, + device: str = 'cpu', + include_schedulers=False, + include_callbacks=False, + include_algorithms=False, + use_grad_scaler=False, + rank_zero_seed=10, + run_name='test_run', + take_step=False, +) -> State: + model, optimizer = init_model_and_optimizer( + use_fsdp=use_fsdp, + use_composer_model=True, + take_step=take_step, + device=device, + ) + + test_dataset_sd = {'test': 0} + device_obj: Device = DeviceCPU() if device == 'cpu' else DeviceGPU() + + dataloader = MagicMock(spec=DataLoader) + dataloader.dataset = MagicMock() + dataloader.dataset.state_dict = MagicMock(return_value=test_dataset_sd) + kwargs = {} + + if include_callbacks: + kwargs['callbacks'] = [SpeedMonitor(), SpeedMonitor()] + if include_algorithms: + kwargs['algorithms'] = [SWA()] + if use_grad_scaler: + if version.parse(torch.__version__) >= version.parse('2.3.0'): + from torch.amp.grad_scaler import GradScaler + else: + from torch.cuda.amp.grad_scaler import GradScaler + kwargs['scaler'] = GradScaler() + + state = State( + model=model, + rank_zero_seed=rank_zero_seed, + run_name=run_name, + device=device_obj, + train_dataloader=dataloader, + optimizers=[optimizer], + **kwargs, + ) + if include_schedulers: + state.schedulers = StepLR(optimizer=optimizer, step_size=2) + return state + + def init_model_and_optimizer( - use_composer_model: bool, + use_composer_model: bool = True, num_classes=3, batch_size=5, num_features=8, @@ -26,7 +87,7 @@ def init_model_and_optimizer( use_fsdp=False, tensor_type='sharded_tensor', device='cuda', -): +) -> Tuple[Union[ComposerModel, torch.nn.Module], torch.optim.Optimizer]: model, loss_fn = init_model( use_composer_model, num_classes=num_classes, @@ -59,7 +120,7 @@ def init_model( tensor_type='sharded_tensor', sync_module_states=True, cpu_offload=False, -): +) -> Tuple[Union[ComposerModel, torch.nn.Module], Any]: if use_composer_model: model = 
SimpleComposerMLP(num_features=num_features, num_classes=num_classes, device=device) loss_fn = model._loss_fn diff --git a/tests/checkpoint/test_save.py b/tests/checkpoint/test_save.py index 03b12bbcbc..f4d41cc09d 100644 --- a/tests/checkpoint/test_save.py +++ b/tests/checkpoint/test_save.py @@ -1,6 +1,7 @@ # Copyright 2024 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 +import json import os import time import uuid @@ -12,15 +13,157 @@ import torch.distributed.checkpoint as DCP from packaging import version -from composer.checkpoint.save import save_state_dict_to_disk -from composer.checkpoint.state_dict import get_model_state_dict +from composer.checkpoint.save import ( + save_checkpoint_to_disk, + save_composer_metadata_to_disk, + save_model_to_disk, + save_optim_to_disk, + save_state_dict_to_disk, +) +from composer.checkpoint.state_dict import get_model_state_dict, get_optim_state_dict +from composer.core import Timestamp from composer.utils import dist -from composer.utils.checkpoint import _TORCH_DISTRIBUTED_CHECKPOINTS_FILENAME -from tests.checkpoint.helpers import init_model +from composer.utils.checkpoint import ( + _TORCH_DISTRIBUTED_CHECKPOINTS_FILENAME, + _TORCH_DISTRIBUTED_CHECKPOINTS_METADATA_FILENAME, +) +from tests.checkpoint.helpers import init_model, init_model_and_optimizer, init_state from tests.common.compare import deep_compare from tests.common.markers import world_size +@pytest.mark.gpu +@pytest.mark.parametrize( + 'world_size,sharded_model,sharded_checkpoint', + [ + pytest.param(1, False, False, marks=pytest.mark.world_size(1)), + pytest.param(2, True, True, marks=pytest.mark.world_size(2)), + pytest.param(2, True, False, marks=pytest.mark.world_size(2)), + ], +) +@pytest.mark.filterwarnings('ignore::UserWarning') +def test_save_checkpoint_to_disk(world_size: int, tmp_path: str, sharded_model: bool, sharded_checkpoint: bool): + destination_dir = os.path.join(tmp_path, str(uuid.uuid4())[:8]) + destination_dir = dist.all_gather_object(destination_dir)[0] + save_options = { + 'destination_dir': destination_dir, + 'save_model': True, + 'save_optimizer': True, + 'save_resumption_state': True, + 'sharded_checkpoint': sharded_checkpoint, + 'dir_prefix': 'ep{epoch}-ba{batch}', + } + state = init_state(use_fsdp=sharded_model, device='cuda', take_step=True) + state.run_name = 'foo' + state.timestamp = Timestamp() + expected_destination_dir = os.path.join(destination_dir, 'ep0-ba0') + save_checkpoint_to_disk(state, save_options) + expected_model_dir = os.path.join(expected_destination_dir, 'model') + expected_optim_dir = os.path.join(expected_destination_dir, 'optim') + expected_metadata_filepath = os.path.join(expected_destination_dir, 'composer_metadata.json') + expected_resumption_filepath = os.path.join(expected_destination_dir, 'resumption.pkl') + if sharded_checkpoint: + checkpoint_filenames = dist.all_gather_object(_TORCH_DISTRIBUTED_CHECKPOINTS_FILENAME) + for checkpoint_filename in checkpoint_filenames: + assert os.path.exists(os.path.join(expected_model_dir, checkpoint_filename)) + assert os.path.exists(os.path.join(expected_optim_dir, checkpoint_filename)) + assert os.path.exists(os.path.join(expected_model_dir, _TORCH_DISTRIBUTED_CHECKPOINTS_METADATA_FILENAME)) + assert os.path.exists(os.path.join(expected_optim_dir, _TORCH_DISTRIBUTED_CHECKPOINTS_METADATA_FILENAME)) + else: + assert os.path.exists(os.path.join(expected_model_dir, 'model.pt')) + assert os.path.exists(os.path.join(expected_optim_dir, 'optim.pt')) + + import time + + # Need to wait 
for the file to be written to avoid flaky test. + time.sleep(0.2) + assert os.path.exists(expected_metadata_filepath) + assert os.path.exists(expected_resumption_filepath) + + +def test_save_composer_metadata_to_disk(tmp_path: str): + destination_dir = os.path.join(tmp_path, str(uuid.uuid4())[:8]) + destination_dir = dist.all_gather_object(destination_dir)[0] + save_composer_metadata_to_disk(destination_dir) + expected_file_path = os.path.join(destination_dir, 'composer_metadata.json') + assert os.path.exists(expected_file_path) + json.load(open(expected_file_path, 'r')) + + +@pytest.mark.gpu +@pytest.mark.parametrize( + 'world_size,sharded_optimizer,sharded_checkpoint', + [ + pytest.param(1, False, False, marks=pytest.mark.world_size(1)), + pytest.param(2, True, True, marks=pytest.mark.world_size(2)), + pytest.param(2, True, False, marks=pytest.mark.world_size(2)), + ], +) +def test_save_optim_to_disk(world_size: int, tmp_path: str, sharded_optimizer: bool, sharded_checkpoint: bool): + destination_dir = os.path.join(tmp_path, str(uuid.uuid4())[:8]) + # Sync the path across all ranks + destination_dir = dist.all_gather_object(destination_dir)[0] + use_fsdp = sharded_optimizer + model, optim = init_model_and_optimizer(use_fsdp=use_fsdp, device='cuda') + optim_state_dict = get_optim_state_dict(model, optimizer=optim, sharded_state_dict=sharded_checkpoint) + optim_state_dict_saved = deepcopy(optim_state_dict) + save_optim_to_disk(model, optim, destination_dir=destination_dir, sharded_checkpoint=sharded_checkpoint) + + # Load new optim from disk + model, optim = init_model_and_optimizer(use_fsdp=use_fsdp, device='cuda') + cur_state_dict = get_optim_state_dict(model, optimizer=optim, sharded_state_dict=sharded_checkpoint) + + if sharded_checkpoint: + expected_file_path = os.path.join(destination_dir, 'optim') + if version.parse(torch.__version__) < version.parse('2.2.0'): + DCP.load_state_dict(state_dict=cur_state_dict, storage_reader=DCP.FileSystemReader(expected_file_path)) + else: + DCP.load(state_dict=cur_state_dict, storage_reader=DCP.FileSystemReader(expected_file_path)) + else: + if dist.get_global_rank() == 0: + expected_file_path = os.path.join(destination_dir, 'optim', 'optim.pt') + cur_state_dict = torch.load(expected_file_path, map_location='cuda') + + deep_compare(optim_state_dict_saved, cur_state_dict) + + +@pytest.mark.gpu +@pytest.mark.parametrize( + 'world_size,sharded_model,sharded_checkpoint', + [ + pytest.param(1, False, False, marks=pytest.mark.world_size(1)), + pytest.param(2, True, True, marks=pytest.mark.world_size(2)), + pytest.param(2, True, False, marks=pytest.mark.world_size(2)), + ], +) +def test_save_model_to_disk(world_size: int, tmp_path: str, sharded_model: bool, sharded_checkpoint: bool): + destination_dir = os.path.join(tmp_path, str(uuid.uuid4())[:8]) + # Sync the path across all ranks + destination_dir = dist.all_gather_object(destination_dir)[0] + use_fsdp = sharded_model + model, _ = init_model(use_fsdp=use_fsdp, device='cuda', sync_module_states=True) + state_dict = get_model_state_dict(model, sharded_state_dict=sharded_checkpoint) + state_dict_saved = deepcopy(state_dict) + save_model_to_disk(model, destination_dir=destination_dir, sharded_checkpoint=sharded_checkpoint) + + # Load new model from disk + new_model, _ = init_model(use_fsdp=use_fsdp, device='cuda', sync_module_states=True) + cur_state_dict = get_model_state_dict(new_model, sharded_state_dict=sharded_checkpoint) + + if sharded_checkpoint: + expected_file_path = os.path.join(destination_dir, 
'model') + if version.parse(torch.__version__) < version.parse('2.2.0'): + DCP.load_state_dict(state_dict=cur_state_dict, storage_reader=DCP.FileSystemReader(expected_file_path)) + else: + DCP.load(state_dict=cur_state_dict, storage_reader=DCP.FileSystemReader(expected_file_path)) + else: + if dist.get_global_rank() == 0: + expected_file_path = os.path.join(destination_dir, 'model', 'model.pt') + cur_state_dict = torch.load(expected_file_path, map_location='cuda') + + deep_compare(state_dict_saved, cur_state_dict) + + @world_size(1, 2) @pytest.mark.gpu @pytest.mark.parametrize('sharded_model', [False, True]) diff --git a/tests/checkpoint/test_state_dict.py b/tests/checkpoint/test_state_dict.py index 4f719254a7..12fde27249 100644 --- a/tests/checkpoint/test_state_dict.py +++ b/tests/checkpoint/test_state_dict.py @@ -3,27 +3,21 @@ import datetime from typing import Any -from unittest.mock import MagicMock import pytest import torch from packaging import version from torch.distributed.fsdp import FullyShardedDataParallel as FSDP -from torch.optim.lr_scheduler import StepLR -from torch.utils.data import DataLoader -from composer.algorithms import SWA -from composer.callbacks import SpeedMonitor from composer.checkpoint import ( get_metadata_state_dict, get_model_state_dict, get_optim_state_dict, get_resumption_state_dict, ) -from composer.core import State -from composer.devices import DeviceCPU, DeviceGPU +from composer.devices import DeviceGPU from composer.utils import dist, reproducibility -from tests.checkpoint.helpers import init_model_and_optimizer +from tests.checkpoint.helpers import init_model_and_optimizer, init_state from tests.common.compare import deep_compare from tests.common.markers import world_size from tests.common.models import EvenSimplerMLP, SimpleComposerMLP, configure_tiny_gpt2_hf_model @@ -444,27 +438,17 @@ def test_get_metadata_sharded_model(model_type: str, tensor_type: str, world_siz @pytest.mark.filterwarnings('ignore:SWA has') def test_get_resumption_state_dict(): - - model, optimizer = init_model_and_optimizer(use_composer_model=True, take_step=True, device='cpu') - - rank_zero_seed = 10 run_name = 'test_run' - device = DeviceCPU() - test_dataset_sd = {'foo': 0} - dataloader = MagicMock(spec=DataLoader) - dataloader.dataset = MagicMock() - dataloader.dataset.state_dict = MagicMock(return_value=test_dataset_sd) - swa = SWA() - state = State( - model=model, + rank_zero_seed = 10 + state = init_state( + device='cpu', + include_algorithms=True, + include_callbacks=True, + include_schedulers=True, rank_zero_seed=rank_zero_seed, run_name=run_name, - device=device, - train_dataloader=dataloader, - algorithms=[swa], - callbacks=[SpeedMonitor(), SpeedMonitor()], ) - state.schedulers = StepLR(optimizer=optimizer, step_size=2) + test_dataset_sd = {'test': 0} rsd = get_resumption_state_dict(state) assert rsd['rank_zero_seed'] == rank_zero_seed @@ -505,27 +489,7 @@ def test_get_resumption_state_dict(): @pytest.mark.gpu def test_get_resumption_state_dict_gpu(): - if version.parse(torch.__version__) >= version.parse('2.3.0'): - from torch.amp.grad_scaler import GradScaler - else: - from torch.cuda.amp.grad_scaler import GradScaler - - model, _ = init_model_and_optimizer(use_composer_model=True, take_step=False, device='cuda') - - rank_zero_seed = 10 - run_name = 'test_run' - device = DeviceCPU() - test_dataset_sd = {'test': 0} - dataloader = MagicMock() - dataloader.dataset = MagicMock() - dataloader.dataset.state_dict = MagicMock(return_value=test_dataset_sd) - state = State( 
- model=model, - rank_zero_seed=rank_zero_seed, - run_name=run_name, - device=device, - scaler=GradScaler(), - ) + state = init_state(device='cuda', use_grad_scaler=True) rsd = get_resumption_state_dict(state) assert 'scaler' in rsd assert set( From d420765ba09ecf6c2965a18e720447d594db4065 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Thu, 20 Jun 2024 21:04:09 -0700 Subject: [PATCH 30/69] fix typing (#3419) --- composer/trainer/_patch_pytorch.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/composer/trainer/_patch_pytorch.py b/composer/trainer/_patch_pytorch.py index 6771c5db4b..3f19df7d2a 100644 --- a/composer/trainer/_patch_pytorch.py +++ b/composer/trainer/_patch_pytorch.py @@ -933,7 +933,8 @@ def device_mesh__getitem__(self, mesh_dim_names: Union[str, tuple[str]]) -> 'Dev return submesh else: - from torch.distributed.device_mesh import _mesh_resources + from torch.utils._typing_utils import not_none + from torch.distributed.device_mesh import DeviceMesh, _mesh_resources def create_child_mesh( self, parent_mesh: 'DeviceMesh', submesh_dim_names: Tuple[str, ...], From ba1789789510bc0d6705b473b338998a2b22d324 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Thu, 20 Jun 2024 22:35:49 -0700 Subject: [PATCH 31/69] Fixes some typing issues (#3418) --- composer/callbacks/eval_output_logging_callback.py | 4 ++++ composer/core/evaluator.py | 10 +++++----- composer/loggers/mlflow_logger.py | 4 +++- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/composer/callbacks/eval_output_logging_callback.py b/composer/callbacks/eval_output_logging_callback.py index fd52b33960..717994413a 100644 --- a/composer/callbacks/eval_output_logging_callback.py +++ b/composer/callbacks/eval_output_logging_callback.py @@ -114,6 +114,10 @@ def eval_batch_end(self, state: State, logger: Logger) -> None: self.rows.extend(rows) def eval_end(self, state: State, logger: Logger) -> None: + # eval_batch_end will have set these if there is anything to log + if self.name is None or self.columns is None: + return + list_of_rows = dist.all_gather_object(self.rows) rows = [row for rows in list_of_rows for row in rows] for dest_logger in logger.destinations: diff --git a/composer/core/evaluator.py b/composer/core/evaluator.py index 767131bc35..d1ef6c947e 100644 --- a/composer/core/evaluator.py +++ b/composer/core/evaluator.py @@ -67,7 +67,7 @@ class Evaluator: When specifying ``eval_interval``, the evaluator(s) are also run at the ``Event.FIT_END`` if it doesn't evenly divide the training duration. - device_eval_microbatch_size (int, optional): The number of samples to use for each microbatch when evaluating. + device_eval_microbatch_size (str | int | float, optional): The number of samples to use for each microbatch when evaluating. If set to ``auto``, dynamically decreases device_eval_microbatch_size if microbatch is too large for GPU. If None, sets `device_eval_microbatch_size` to per rank batch size. 
(default: ``None``) """ @@ -80,7 +80,7 @@ def __init__( metric_names: Optional[list[str]] = None, subset_num_batches: Optional[int] = None, eval_interval: Optional[Union[int, str, Time, Callable[[State, Event], bool]]] = None, - device_eval_microbatch_size: Optional[Union[int, str]] = None, + device_eval_microbatch_size: Optional[Union[int, str, float]] = None, ): self.label = label self.dataloader = ensure_data_spec(dataloader) @@ -142,7 +142,7 @@ def ensure_evaluator(evaluator: Union[Evaluator, DataSpec, Iterable, dict[str, A ) -def _is_auto_microbatching(device_eval_microbatch_size: Optional[Union[int, str]]): +def _is_auto_microbatching(device_eval_microbatch_size: Optional[Union[int, str, float]]): if device_eval_microbatch_size == 'auto': warnings.warn(( "Setting `device_eval_microbatch_size='auto'` is an experimental feature which may cause " @@ -155,10 +155,10 @@ def _is_auto_microbatching(device_eval_microbatch_size: Optional[Union[int, str] def _get_initial_device_eval_microbatch_size( - device_eval_microbatch_size: Optional[Union[int, str]], + device_eval_microbatch_size: Optional[Union[int, str, float]], auto_microbatching: bool, dataloader: Iterable, -) -> int: +) -> Union[int, float]: """Sets initial value of device_eval_microbatch_size. If auto_microbatching, sets initial `device_eval_microbatch_size` to per rank batch size. diff --git a/composer/loggers/mlflow_logger.py b/composer/loggers/mlflow_logger.py index aed32eea39..526a7962fd 100644 --- a/composer/loggers/mlflow_logger.py +++ b/composer/loggers/mlflow_logger.py @@ -185,6 +185,9 @@ def __init__( def _start_mlflow_run(self, state): import mlflow + # This function is only called if self._enabled is True, and therefore self._experiment_id is not None. + assert self._experiment_id is not None + env_run_id = os.getenv( mlflow.environment_variables.MLFLOW_RUN_ID.name, # pyright: ignore[reportGeneralTypeIssues] None, @@ -193,7 +196,6 @@ def _start_mlflow_run(self, state): self._run_id = env_run_id elif self.resume: # Search for an existing run tagged with this Composer run if `self.resume=True`. - assert self._experiment_id is not None run_name = self.tags['run_name'] existing_runs = mlflow.search_runs( experiment_ids=[self._experiment_id], From 4e8ed2eaee03f2b2ed99a3158489f97c3e12a370 Mon Sep 17 00:00:00 2001 From: Brian <23239305+b-chu@users.noreply.github.com> Date: Fri, 21 Jun 2024 16:02:24 -0400 Subject: [PATCH 32/69] Fix small things (#3420) --- composer/core/state.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/composer/core/state.py b/composer/core/state.py index fa4feaec75..a1bb14f0af 100644 --- a/composer/core/state.py +++ b/composer/core/state.py @@ -759,7 +759,7 @@ def _iteration_length(self): def _iteration_length(self, iteration_length: Optional[Union[str, Time[int]]]): """Sets the length of an iteration. - An iteration must be defined as multiple epochs. See composer/core/event.py. + An iteration must be defined as multiple epochs or tokens. See composer/core/event.py. """ if iteration_length is None: self.__iteration_length = None @@ -777,7 +777,7 @@ def stop_training(self): logging, and evaluation for that batch, as well as any epoch end events. """ # Set the max_duration to the current time in its unit, except if the unit is TimeUnit.EPOCH. This is because TimeUnit.EPOCH is a very crude way to measure max duration. 
For example, it will result in division by zero error while computing get_elapsed_duration: https://github.com/mosaicml/composer/blob/1b9c6d3c0592183b947fd89890de0832366e33a7/composer/core/state.py#L641 - if self.max_duration is not None and Time.from_input(self.max_duration,).unit != TimeUnit.EPOCH: + if self.max_duration is not None and Time.from_input(self.max_duration).unit != TimeUnit.EPOCH: max_duration_unit = Time.from_input(self.max_duration).unit self.max_duration = self.timestamp.get(max_duration_unit) else: From 5ba56acf040fd676def4780fcd201e76649eb96d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 24 Jun 2024 07:52:48 -0700 Subject: [PATCH 33/69] Bump coverage[toml] from 7.5.3 to 7.5.4 (#3422) Bumps [coverage[toml]](https://github.com/nedbat/coveragepy) from 7.5.3 to 7.5.4. - [Release notes](https://github.com/nedbat/coveragepy/releases) - [Changelog](https://github.com/nedbat/coveragepy/blob/master/CHANGES.rst) - [Commits](https://github.com/nedbat/coveragepy/compare/7.5.3...7.5.4) --- updated-dependencies: - dependency-name: coverage[toml] dependency-type: direct:development update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 29f7a8466b..dc0be75cd3 100644 --- a/setup.py +++ b/setup.py @@ -103,7 +103,7 @@ def package_files(prefix: str, directory: str, extension: str): # Should manually update dependency versions occassionally. 'custom_inherit==2.4.1', 'junitparser==3.1.2', - 'coverage[toml]==7.5.3', + 'coverage[toml]==7.5.4', 'fasteners==0.18', # object store tests require fasteners 'pytest==7.4.4', 'ipython==8.11.0', From abfd78c3a380b92ff26d842c9f3ca0d72cefb5f7 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 24 Jun 2024 08:31:41 -0700 Subject: [PATCH 34/69] Update psutil requirement from <6,>=5.8.0 to >=5.8.0,<7 (#3424) Updates the requirements on [psutil](https://github.com/giampaolo/psutil) to permit the latest version. - [Changelog](https://github.com/giampaolo/psutil/blob/master/HISTORY.rst) - [Commits](https://github.com/giampaolo/psutil/compare/release-5.8.0...release-6.0.0) --- updated-dependencies: - dependency-name: psutil dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Mihir Patel --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index dc0be75cd3..0afe4ed314 100644 --- a/setup.py +++ b/setup.py @@ -84,7 +84,7 @@ def package_files(prefix: str, directory: str, extension: str): 'torch>=2.1.2,<2.3.2', 'requests>=2.26.0,<3', 'numpy>=1.21.5,<2.1.0', - 'psutil>=5.8.0,<6', + 'psutil>=5.8.0,<7', 'coolname>=1.1.0,<3', 'tabulate==0.9.0', # for auto-generating tables 'py-cpuinfo>=8.0.0,<10', From d3e95a92ac8fa37914ff67eac32ef43a48fdbc5f Mon Sep 17 00:00:00 2001 From: Joe Early Date: Mon, 24 Jun 2024 19:45:54 +0100 Subject: [PATCH 35/69] Add support for variable length dataloaders in DDP (#3416) * Add support for variable length dataloaders in dist training * Remove test file * Fix typo * Fixed batch referenced before assignment * Replace sentinel with None * Add unit test * Update unit test * Reduce tensor creation to one line Co-authored-by: Mihir Patel * Remove requirement for gpu in test --------- Co-authored-by: Mihir Patel --- composer/trainer/trainer.py | 13 ++++++++++++ tests/trainer/test_trainer.py | 37 +++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index 4447698beb..91dd0b1e19 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -3640,6 +3640,11 @@ def _iter_dataloader(self, trainer_mode: TrainerMode): else: dataloader_iter = itertools.islice(self.state.dataloader, int(self.state.dataloader_len)) + # Track if iteration has finished (used for distributed training when we have variable length dataloaders) + # 0 = not finished, 1 = finished (using integer tensors so we can use dist.all_reduce) + iter_finished = self.state.device.tensor_to_device(torch.zeros(1, dtype=torch.uint8)) + + batch = None while True: try: # [BEFORE/AFTER]_DATALOADER only runs while training @@ -3655,7 +3660,15 @@ def _iter_dataloader(self, trainer_mode: TrainerMode): # Otherwise, we will encounter an error at the start of the next epoch when # Event.BEFORE_DATALOADER tries to start an unfinished marker. 
self.engine.run_marker_only_event(Event.AFTER_DATALOADER) + # Mark iteration as finished - don't break yet as we need to sync across ranks + iter_finished += 1 + + # Sync iter finished across ranks + dist.all_reduce(iter_finished, reduce_operation='MAX') + # If any rank has finished, stop all rank iterations + if iter_finished.item() == 1: break + yield batch def _use_closures(self) -> bool: diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 59e8b26782..1bb5d265b6 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -1250,6 +1250,43 @@ def test_accumulate_time_across_ranks( assert num_tokens_accum == num_tokens * 2 assert batch_time_accum == datetime.timedelta(seconds=0.1 * (1 + 0)) + @pytest.mark.world_size(2) + def test_rank_dependent_dataloader_lengths( + self, + model: ComposerModel, + max_duration: Time[int], + ): + # Change rank 1 dataloader size to create different sized dataloaders on each rank + batch_size = 4 + orig_num_samples = 16 + rank_num_samples = orig_num_samples + 8 if dist.get_local_rank() == 1 else orig_num_samples + # Create train and eval dataloaders (will have rank-dependent lengths) + train_dataset = RandomClassificationDataset(size=rank_num_samples) + train_dataloader = DataLoader( + dataset=train_dataset, + batch_size=batch_size, + sampler=dist.get_sampler(train_dataset), + ) + eval_dataset = RandomClassificationDataset(size=rank_num_samples) + eval_dataloader = DataLoader( + dataset=eval_dataset, + batch_size=batch_size, + sampler=dist.get_sampler(eval_dataset), + ) + # Fit (train + eval) + trainer = Trainer( + model=model, + max_duration=max_duration, + train_dataloader=train_dataloader, + eval_dataloader=eval_dataloader, + ) + trainer.fit() + # Check the correct number of samples and batches have been processed + assert trainer.state.timestamp.sample.value == orig_num_samples + assert trainer.state.timestamp.batch.value == orig_num_samples / batch_size / 2 + assert trainer.state.eval_timestamp.sample.value == orig_num_samples + assert trainer.state.eval_timestamp.batch.value == orig_num_samples / batch_size / 2 + @world_size(1, 2) @device('cpu', 'gpu', 'gpu-amp', precision=True) From 84c4723108d3e378d20591a51adc82e84a13ccfc Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Mon, 24 Jun 2024 18:30:50 -0400 Subject: [PATCH 36/69] Hsdp + MoE CI tests (#3378) * fold ema fsdp state * debug * debug * more debug * keep debugging * debug * sanity check * debug * debug * use ema * debug * debug * debug * debug * debug * debug * more fix * filename test * revert test * fully parameterize * hsdp test * revert testing * typo * typo * hsdp * split off test * precommit * float to int * pyright * oom * print * rm tp * tp cfg * tp? 
* rm tp line * type annotation * revert * readd tp * type * world size * revert * revert monolithic cpkt + include sharded cpkt * enumerate * precommit * precommit * sharded * sync * only sync on first trainer * typo * hsdp * xfail * explicit sync * test * revert test * sync, docker issue * pre-commit * sync * pytest * xfail * rm world_size param * im so sorry pls forgive me king * the kings comments * Update tests/trainer/test_fsdp_checkpoint.py fix formatting Co-authored-by: Mihir Patel * precommit --------- Co-authored-by: v-chen_data Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> Co-authored-by: Mihir Patel --- tests/trainer/test_fsdp_checkpoint.py | 86 ++++++++++++++++++--------- 1 file changed, 59 insertions(+), 27 deletions(-) diff --git a/tests/trainer/test_fsdp_checkpoint.py b/tests/trainer/test_fsdp_checkpoint.py index bb99a9287e..2e5fd5d07b 100644 --- a/tests/trainer/test_fsdp_checkpoint.py +++ b/tests/trainer/test_fsdp_checkpoint.py @@ -289,21 +289,21 @@ def _compare_timestamps_between_state_dicts(state_dict1, state_dict2): @pytest.mark.gpu @pytest.mark.filterwarnings(r'ignore:.*scatter_full_optim_state_dict``is being deprecated.*:UserWarning') @pytest.mark.parametrize( - 'world_size,optimizer,autoresume,precision,save_weights_only,load_weights_only,load_monolith_rank0_only,use_tp', + 'optimizer,autoresume,precision,save_weights_only,load_weights_only,load_monolith_rank0_only,use_tp,use_hsdp', [ - pytest.param(2, 'adam', False, 'amp_bf16', False, False, False, False, marks=pytest.mark.world_size(2)), - pytest.param(2, 'adamw', False, 'amp_bf16', False, False, False, False, marks=pytest.mark.world_size(2)), - pytest.param(2, 'adam', True, 'amp_bf16', False, False, False, False, marks=pytest.mark.world_size(2)), - pytest.param(2, 'adam', False, 'amp_fp16', False, False, False, False, marks=pytest.mark.world_size(2)), - pytest.param(2, 'adam', False, 'amp_bf16', True, True, False, False, + pytest.param('adam', False, 'amp_bf16', False, False, False, False, False, marks=pytest.mark.world_size(2)), + pytest.param('adamw', False, 'amp_bf16', False, False, False, False, False, marks=pytest.mark.world_size(2)), + pytest.param('adam', True, 'amp_bf16', False, False, False, False, False, marks=pytest.mark.world_size(2)), + pytest.param('adam', False, 'amp_fp16', False, False, False, False, False, marks=pytest.mark.world_size(2)), + pytest.param('adam', False, 'amp_bf16', True, True, False, False, False, marks=pytest.mark.world_size(2)), # save_weights_only requires load_weights_only - pytest.param(2, 'adam', False, 'amp_bf16', False, True, False, False, marks=pytest.mark.world_size(2)), - pytest.param(2, 'adam', False, 'amp_bf16', False, False, True, False, marks=pytest.mark.world_size(2)), - pytest.param(4, 'adam', False, 'amp_bf16', False, False, False, True, marks=pytest.mark.world_size(4)), + pytest.param('adam', False, 'amp_bf16', False, True, False, False, False, marks=pytest.mark.world_size(2)), + pytest.param('adam', False, 'amp_bf16', False, False, True, False, False, marks=pytest.mark.world_size(2)), + pytest.param('adam', False, 'amp_bf16', False, False, False, True, False, marks=pytest.mark.world_size(4)), + pytest.param('adam', False, 'amp_bf16', False, False, False, False, True, marks=pytest.mark.world_size(4)), ], ) def test_fsdp_full_state_dict_load( - world_size, tmp_path: pathlib.Path, autoresume: bool, precision: str, @@ -312,7 +312,10 @@ def test_fsdp_full_state_dict_load( load_weights_only: bool, load_monolith_rank0_only: bool, use_tp: bool, 
+ use_hsdp: bool, ): + if use_hsdp: + pytest.xfail('Known Pytorch issue with HSDP, waiting for pytorch patch') if autoresume: run_name = 'my-cool-autoresume-run' else: @@ -320,11 +323,20 @@ def test_fsdp_full_state_dict_load( save_folder = tmp_path save_filename = 'rank{rank}.pt' - fsdp_config = FSDPConfig( - sharded_ckpt_prefix_dir='ba{batch}', - sync_module_states=load_monolith_rank0_only, - load_monolith_rank0_only=load_monolith_rank0_only, - ) + if use_hsdp: + fsdp_config = FSDPConfig( + sharding_strategy='HYBRID_SHARD', + sharded_ckpt_prefix_dir='ba{batch}', + data_parallel_shard_degree=2, + data_parallel_replicate_degree=2, + sync_module_states=True, + ) + else: + fsdp_config = FSDPConfig( + sharded_ckpt_prefix_dir='ba{batch}', + sync_module_states=load_monolith_rank0_only, + load_monolith_rank0_only=load_monolith_rank0_only, + ) tp_config = None if use_tp: from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel @@ -778,23 +790,33 @@ def mock_get_checkpoint_validation_function(): @pytest.mark.gpu @pytest.mark.parametrize('use_remote', [pytest.param(True, marks=pytest.mark.remote), False]) @pytest.mark.parametrize( - 'world_size,weights_only,optimizer,precision,autoresume,load_ignore_keys,use_symlink,use_tp', + 'weights_only,optimizer,precision,autoresume,load_ignore_keys,use_symlink,use_tp,use_hsdp', [ - pytest.param(2, False, 'adamw', 'amp_bf16', False, None, False, False, marks=pytest.mark.world_size(2)), - pytest.param(2, True, 'adamw', 'amp_bf16', False, None, False, False, marks=pytest.mark.world_size(2)), - pytest.param(2, False, 'adam', 'amp_bf16', False, None, False, False, marks=pytest.mark.world_size(2)), - pytest.param(2, False, 'adamw', 'amp_fp16', False, None, False, False, marks=pytest.mark.world_size(2)), - pytest.param(2, False, 'adamw', 'amp_bf16', True, None, False, False, marks=pytest.mark.world_size(2)), - pytest.param(2, False, 'adamw', 'amp_bf16', False, ['rng'], False, False, marks=pytest.mark.world_size(2)), - pytest.param(2, False, 'adamw', 'amp_bf16', False, None, True, False, marks=pytest.mark.world_size(2)), - pytest.param(2, False, 'adamw', 'amp_bf16', False, None, False, True, marks=pytest.mark.world_size(4)), + pytest.param(False, 'adamw', 'amp_bf16', False, None, False, False, False, marks=pytest.mark.world_size(2)), + pytest.param(True, 'adamw', 'amp_bf16', False, None, False, False, False, marks=pytest.mark.world_size(2)), + pytest.param(False, 'adam', 'amp_bf16', False, None, False, False, False, marks=pytest.mark.world_size(2)), + pytest.param(False, 'adamw', 'amp_fp16', False, None, False, False, False, marks=pytest.mark.world_size(2)), + pytest.param(False, 'adamw', 'amp_bf16', True, None, False, False, False, marks=pytest.mark.world_size(2)), + pytest.param( + False, + 'adamw', + 'amp_bf16', + False, + ['rng'], + False, + False, + False, + marks=pytest.mark.world_size(2), + ), + pytest.param(False, 'adamw', 'amp_bf16', False, None, True, False, False, marks=pytest.mark.world_size(2)), + pytest.param(False, 'adamw', 'amp_bf16', False, None, False, True, False, marks=pytest.mark.world_size(4)), + pytest.param(False, 'adamw', 'amp_bf16', False, None, False, False, True, marks=pytest.mark.world_size(4)), ], ) @pytest.mark.filterwarnings(r'ignore:TypedStorage is deprecated.:UserWarning') @pytest.mark.filterwarnings(r'ignore:.*metrics are not saved with sharded state dict.*:UserWarning') @pytest.mark.filterwarnings(r'ignore:Please use DTensor instead and we are deprecating ShardedTensor.:UserWarning') def 
test_fsdp_partitioned_state_dict_load( - world_size, tmp_path: pathlib.Path, autoresume: bool, precision: str, @@ -803,6 +825,7 @@ def test_fsdp_partitioned_state_dict_load( load_ignore_keys: Union[list[str], None], use_symlink: bool, use_tp: bool, + use_hsdp: bool, use_remote, s3_bucket, s3_ephemeral_prefix, @@ -829,10 +852,19 @@ def test_fsdp_partitioned_state_dict_load( save_filename = 'ba{batch}-rank{rank}.pt' - fsdp_config = FSDPConfig(state_dict_type='sharded', sharded_ckpt_prefix_dir='ba{batch}') + if use_hsdp: + fsdp_config = FSDPConfig( + sharding_strategy='HYBRID_SHARD', + sharded_ckpt_prefix_dir='ba{batch}', + state_dict_type='sharded', + data_parallel_shard_degree=2, + data_parallel_replicate_degree=2, + sync_module_states=True, + ) + else: + fsdp_config = FSDPConfig(state_dict_type='sharded', sharded_ckpt_prefix_dir='ba{batch}') tp_config = None if use_tp: - fsdp_config = FSDPConfig(state_dict_type='sharded', sharded_ckpt_prefix_dir='ba{batch}') from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel tp_config = { 'tensor_parallel_degree': 2, From 450130572ec77cb2327fbce96488ba4544dc79e3 Mon Sep 17 00:00:00 2001 From: Jack Zhang <170473087+JackZ-db@users.noreply.github.com> Date: Mon, 24 Jun 2024 17:57:08 -0700 Subject: [PATCH 37/69] bumping mlflow to 2.14.1 (#3425) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0afe4ed314..a1deea27c7 100644 --- a/setup.py +++ b/setup.py @@ -223,7 +223,7 @@ def package_files(prefix: str, directory: str, extension: str): ] extra_deps['mlflow'] = [ - 'mlflow>=2.11.1,<3.0', + 'mlflow>=2.14.1,<3.0', 'databricks-sdk==0.28.0', 'pynvml>=11.5.0,<12', ] From a7218d151f691649b846aaa32c7cbcd1fd6d90c4 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 25 Jun 2024 16:42:37 -0400 Subject: [PATCH 38/69] Skip HSDP + TP pytests that require torch 2.3 or above (#3426) * test * skip if torch version less than 2.3 * typo in ema * add remote * comments --------- Co-authored-by: v-chen_data --- tests/trainer/test_fsdp_checkpoint.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/trainer/test_fsdp_checkpoint.py b/tests/trainer/test_fsdp_checkpoint.py index 2e5fd5d07b..3b4f26024c 100644 --- a/tests/trainer/test_fsdp_checkpoint.py +++ b/tests/trainer/test_fsdp_checkpoint.py @@ -316,6 +316,8 @@ def test_fsdp_full_state_dict_load( ): if use_hsdp: pytest.xfail('Known Pytorch issue with HSDP, waiting for pytorch patch') + if (use_tp or use_hsdp) and version.parse(torch.__version__) < version.parse('2.3.0'): + pytest.skip('HSDP and TP require torch 2.3.0 or later') if autoresume: run_name = 'my-cool-autoresume-run' else: @@ -833,8 +835,8 @@ def test_fsdp_partitioned_state_dict_load( ): if weights_only and autoresume: pytest.skip('Weights only with autoresume is not supported') - if use_tp and version.parse(torch.__version__) < version.parse('2.3.0'): - pytest.skip('TP requires torch 2.3.0 or later') + if (use_tp or use_hsdp) and version.parse(torch.__version__) < version.parse('2.3.0'): + pytest.skip('HSDP and TP require torch 2.3.0 or later') load_ignore_keys = [] if load_ignore_keys is None else load_ignore_keys From 83618629ec09b9d919c1350781831e8196377ad2 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Wed, 26 Jun 2024 14:38:04 -0700 Subject: [PATCH 39/69] remove codeql (#3429) --- .github/workflows/codeql-analysis.yml | 52 --------------------------- 1 file changed, 52 deletions(-) delete mode 100644 .github/workflows/codeql-analysis.yml diff --git 
a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml deleted file mode 100644 index 0cb835fbde..0000000000 --- a/.github/workflows/codeql-analysis.yml +++ /dev/null @@ -1,52 +0,0 @@ -# For most projects, this workflow file will not need changing; you simply need -# to commit it to your repository. -# -# You may wish to alter this file to override the set of languages analyzed, -# or to provide custom queries or build logic. -# -# ******** NOTE ******** -# We have attempted to detect the languages in your repository. Please check -# the `language` matrix defined below to confirm you have the correct set of -# supported CodeQL languages. -# -name: "CodeQL" - -on: - push: - branches: [dev, main] - pull_request: - # The branches below must be a subset of the branches above - branches: [dev, main] - schedule: - - cron: "0 9 * * 1" # Every Monday at 09:00 (9:00 AM) - -jobs: - analyze: - name: Analyze - runs-on: ubuntu-latest - permissions: - actions: read - contents: read - security-events: write - - strategy: - fail-fast: false - matrix: - language: ["python"] - # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', - # 'python', 'ruby' ] - # Learn more about CodeQL language support at - # https://git.io/codeql-language-support - - steps: - - name: Checkout repository - uses: actions/checkout@v2 - - name: Get composite run steps repository - uses: actions/checkout@v3 - with: - repository: mosaicml/ci-testing - ref: v0.0.8 - path: ./ci-testing - - uses: ./ci-testing/.github/actions/codeql-analysis - with: - language: ${{ matrix.language }} From 0b749339e3040362e6c9740a2e68276d41f0258d Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Thu, 27 Jun 2024 09:56:20 -0700 Subject: [PATCH 40/69] Remove save overwrite (#3431) * remove save overwrite * fix tests * lint * remove bad test --- composer/trainer/trainer.py | 5 ----- tests/trainer/test_checkpoint.py | 26 +++++++++++++++++--------- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index 91dd0b1e19..f5a6b57d77 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -1732,11 +1732,6 @@ def __init__( error_message = '' if save_folder is None: error_message += 'The `save_folder` must be specified when autoresume is enabled. ' - if save_overwrite: - error_message += textwrap.dedent( - 'The flag `save_overwrite` must be False when autoresume is enabled as autoresume always loads the ' - 'latest existing checkpoint in `save_folder`. ', - ) if save_latest_filename is None: error_message += 'The `save_latest_filename` must be specified so autoresume knows where to load checkpoints from. 
' if error_message != '': diff --git a/tests/trainer/test_checkpoint.py b/tests/trainer/test_checkpoint.py index d23b55875f..9912563eb8 100644 --- a/tests/trainer/test_checkpoint.py +++ b/tests/trainer/test_checkpoint.py @@ -730,11 +730,19 @@ def get_logger(self, tmp_path: pathlib.Path): @world_size(1, 2) @device('cpu', 'gpu') - @pytest.mark.parametrize('file_extension', ['.pt', '.tar.gz', '.pt.lz4']) @pytest.mark.parametrize('use_object_store', [True, False]) @pytest.mark.parametrize('delete_local', [True, False]) @pytest.mark.parametrize('test_slashed', [True, False]) - @pytest.mark.parametrize('save_metrics', [True, False]) + @pytest.mark.parametrize( + 'file_extension,save_metrics,save_overwrite', + [ + ['.pt', False, False], + ['.tar.gz', False, False], + ['.pt.lz4', False, False], + ['.pt', True, False], + ['.pt', False, True], + ], + ) def test_autoresume( self, device: str, @@ -744,6 +752,7 @@ def test_autoresume( delete_local: bool, test_slashed: bool, save_metrics: bool, + save_overwrite: bool, world_size: int, ): if delete_local and not use_object_store: @@ -786,6 +795,7 @@ def test_autoresume( autoresume=True, load_path='ignore_me.pt', # this should be ignored load_ignore_keys=['*'], # this should be ignored + save_overwrite=save_overwrite, loggers=[self.get_logger(tmp_path)] if use_object_store else [], ) @@ -1212,19 +1222,17 @@ def test_load_weights_object_store(self, tmp_path): ) @pytest.mark.parametrize( - 'run_name,save_folder,save_overwrite,latest_filename', + 'run_name,save_folder,latest_filename', [ - [None, 'first', False, 'latest-rank{rank}.pt'], - ['big-chungus', None, False, 'latest-rank{rank}.pt'], - ['big-chungus', 'first', True, 'latest-rank{rank}.pt'], - ['big-chungus', 'first', False, None], + [None, 'first', 'latest-rank{rank}.pt'], + ['big-chungus', None, 'latest-rank{rank}.pt'], + ['big-chungus', 'first', None], ], ) - def test_autoresume_fail(self, run_name, save_folder, save_overwrite, latest_filename): + def test_autoresume_fail(self, run_name, save_folder, latest_filename): with pytest.raises(ValueError): self.get_trainer( latest_filename=latest_filename, - save_overwrite=save_overwrite, save_folder=save_folder, run_name=run_name, autoresume=True, From dd3e7f904a2f5786559518863f7a7a47718cec28 Mon Sep 17 00:00:00 2001 From: Saaketh Narayan Date: Fri, 28 Jun 2024 11:24:35 -0700 Subject: [PATCH 41/69] LeDocs (#3430) --- docs/source/notes/distributed_training.rst | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/docs/source/notes/distributed_training.rst b/docs/source/notes/distributed_training.rst index 192167c935..9422e4280b 100644 --- a/docs/source/notes/distributed_training.rst +++ b/docs/source/notes/distributed_training.rst @@ -540,23 +540,24 @@ Composer integrates Pytorch's `Tensor Parallel `__. -This config is passed under `parallelism_config['tp']` to the Composer Trainer. An important parameters -which do not map include `tensor_parallel_degree`, which dictates the number of devices to shard across. +This config is passed under `parallelism_config['tp']` to the Composer Trainer. Important parameters +which do not directly map include `tensor_parallel_degree`, which dictates the number of devices to shard across, +and `layer_plan`, which simply corresponds to torch's `parallelize_plan`. -An example code snippet for using FSDP with composer is provided below: +An example code snippet for using TP and FSDP with Composer is provided below: .. 
code:: python @@ -624,10 +625,12 @@ An example code snippet for using FSDP with composer is provided below: } } - trainer = Trainer( model=composer_model, - parallelism_config={'fsdp': fsdp_config}, + parallelism_config={ + 'fsdp': fsdp_config, + 'tp': tp_config, + }, ... ) From ac4bd59d130c47238427467a0b79d724224a351d Mon Sep 17 00:00:00 2001 From: Chen Qian Date: Fri, 28 Jun 2024 15:06:12 -0700 Subject: [PATCH 42/69] Lower the system metrics logging frequency to reduce MLflow server's load (#3436) * lower the system metrics logging frequency * more frequent --- composer/loggers/mlflow_logger.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/composer/loggers/mlflow_logger.py b/composer/loggers/mlflow_logger.py index 526a7962fd..c90f167b82 100644 --- a/composer/loggers/mlflow_logger.py +++ b/composer/loggers/mlflow_logger.py @@ -123,6 +123,13 @@ def __init__( if logging_buffer_seconds: os.environ['MLFLOW_ASYNC_LOGGING_BUFFERING_SECONDS'] = str(logging_buffer_seconds) + if log_system_metrics: + # Set system metrics sampling interval and samples before logging so that system metrics + # are collected every 5s, and aggregated over 3 samples before being logged + # (logging per 15s). + mlflow.set_system_metrics_samples_before_logging(3) + mlflow.set_system_metrics_sampling_interval(5) + self._rank_zero_only = rank_zero_only self._last_flush_time = time.time() self._flush_interval = flush_interval From 38e5e515f7f569b833cf118c4f7bf17646bb9e5b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 1 Jul 2024 07:39:48 -0700 Subject: [PATCH 43/69] Update paramiko requirement from <3,>=2.11.0 to >=3.4.0,<4 (#3439) Updates the requirements on [paramiko](https://github.com/paramiko/paramiko) to permit the latest version. - [Commits](https://github.com/paramiko/paramiko/compare/2.11.0...3.4.0) --- updated-dependencies: - dependency-name: paramiko dependency-type: direct:development ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index a1deea27c7..768f6655ea 100644 --- a/setup.py +++ b/setup.py @@ -202,7 +202,7 @@ def package_files(prefix: str, directory: str, extension: str): extra_deps['streaming'] = [ 'mosaicml-streaming<1.0', 'boto3>=1.21.45,<2', - 'paramiko>=2.11.0,<3', + 'paramiko>=3.4.0,<4', ] extra_deps['libcloud'] = [ From 6b461d0333cece7a6e537def54076de8389dbb81 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Mon, 1 Jul 2024 08:17:14 -0700 Subject: [PATCH 44/69] bump versions (#3433) --- .github/workflows/code-quality.yaml | 2 +- .github/workflows/coverage.yaml | 2 +- .github/workflows/daily.yaml | 4 ++-- .github/workflows/pr-cpu.yaml | 2 +- .github/workflows/pr-gpu.yaml | 6 +++--- .github/workflows/release.yaml | 2 +- .github/workflows/smoketest.yaml | 2 +- 7 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/code-quality.yaml b/.github/workflows/code-quality.yaml index c35546f4ca..432e031cb4 100644 --- a/.github/workflows/code-quality.yaml +++ b/.github/workflows/code-quality.yaml @@ -34,7 +34,7 @@ jobs: uses: actions/checkout@v3 with: repository: mosaicml/ci-testing - ref: v0.0.8 + ref: v0.0.9 path: ./ci-testing - uses: ./ci-testing/.github/actions/code-quality with: diff --git a/.github/workflows/coverage.yaml b/.github/workflows/coverage.yaml index 9432e8c6c9..fc511d7e60 100644 --- a/.github/workflows/coverage.yaml +++ b/.github/workflows/coverage.yaml @@ -16,7 +16,7 @@ jobs: uses: actions/checkout@v3 with: repository: mosaicml/ci-testing - ref: v0.0.8 + ref: v0.0.9 path: ./ci-testing - uses: ./ci-testing/.github/actions/coverage with: diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml index 6b67e857ec..aa97c755c8 100644 --- a/.github/workflows/daily.yaml +++ b/.github/workflows/daily.yaml @@ -14,7 +14,7 @@ concurrency: cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }} jobs: daily-pytest-cpu: - uses: mosaicml/ci-testing/.github/workflows/pytest-cpu.yaml@v0.0.8 + uses: mosaicml/ci-testing/.github/workflows/pytest-cpu.yaml@v0.0.9 strategy: matrix: include: @@ -100,7 +100,7 @@ jobs: download-path: artifacts daily-pytest-gpu: - uses: mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.8 + uses: mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.9 strategy: matrix: # Unlike CPU tests, we run daily tests together with GPU tests to minimize launch time diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index 12f471749e..23129715db 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -9,7 +9,7 @@ concurrency: cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }} jobs: pytest-cpu: - uses: mosaicml/ci-testing/.github/workflows/pytest-cpu.yaml@v0.0.8 + uses: mosaicml/ci-testing/.github/workflows/pytest-cpu.yaml@v0.0.9 strategy: matrix: include: diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml index f056292a43..f6de8908c1 100644 --- a/.github/workflows/pr-gpu.yaml +++ b/.github/workflows/pr-gpu.yaml @@ -9,7 +9,7 @@ concurrency: cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }} jobs: pytest-gpu-1: - uses: mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.8 + uses: mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.9 
strategy: matrix: include: @@ -35,7 +35,7 @@ jobs: mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }} pytest-gpu-2: - uses: mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.8 + uses: mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.9 strategy: matrix: include: @@ -62,7 +62,7 @@ jobs: pytest-gpu-4: - uses: mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.8 + uses: mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.9 strategy: matrix: include: diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 0b253ea87f..c841e6c150 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -24,7 +24,7 @@ jobs: uses: actions/checkout@v3 with: repository: mosaicml/ci-testing - ref: v0.0.8 + ref: v0.0.9 path: ./ci-testing - uses: ./ci-testing/.github/actions/code-quality with: diff --git a/.github/workflows/smoketest.yaml b/.github/workflows/smoketest.yaml index e9c6316a8d..b7bb09aaab 100644 --- a/.github/workflows/smoketest.yaml +++ b/.github/workflows/smoketest.yaml @@ -33,7 +33,7 @@ jobs: uses: actions/checkout@v3 with: repository: mosaicml/ci-testing - ref: v0.0.8 + ref: v0.0.9 path: ./ci-testing - uses: ./ci-testing/.github/actions/smoketest with: From 6bac335bf95c848414688cd3013826e111463c2e Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Mon, 1 Jul 2024 09:47:33 -0700 Subject: [PATCH 45/69] fix eval after all (#3445) --- composer/core/event.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/composer/core/event.py b/composer/core/event.py index 1374a32c3f..e88d24109e 100644 --- a/composer/core/event.py +++ b/composer/core/event.py @@ -57,7 +57,7 @@ class Event(StringEnum): # - # + # for eval_dataloader in eval_dataloaders: if should_eval(batch=True): # @@ -70,7 +70,7 @@ class Event(StringEnum): # # - # + # # # From 3cd6e6de5f6506063d99f0945a59a8d3917fe1b1 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Mon, 1 Jul 2024 10:08:06 -0700 Subject: [PATCH 46/69] skip log (#3446) --- composer/loggers/mosaicml_logger.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/composer/loggers/mosaicml_logger.py b/composer/loggers/mosaicml_logger.py index cbbcd285c8..8b4ff5942a 100644 --- a/composer/loggers/mosaicml_logger.py +++ b/composer/loggers/mosaicml_logger.py @@ -146,7 +146,8 @@ def predict_end(self, state: State, logger: Logger) -> None: self._flush_metadata(force_flush=True) def close(self, state: State, logger: Logger) -> None: - self._flush_metadata(force_flush=True, future=False) + # Skip flushing metadata as it should be logged by fit/eval/predict_end. Flushing here + # might schedule futures while interpreter is shutting down, which will raise an error. 
if self._enabled: wait(self._futures) # Ignore raised errors on close From cf76c96d1462a801f31f41d82c56a47a84724999 Mon Sep 17 00:00:00 2001 From: Anna Date: Mon, 1 Jul 2024 11:38:13 -0700 Subject: [PATCH 47/69] Remove MosaicMLLambdaEvalClient (#3432) --- composer/utils/__init__.py | 3 +- composer/utils/eval_client/__init__.py | 2 - .../mosaicml_lambda_eval_client.py | 82 ------------------- .../eval_client/test_mcli_eval_client.py | 42 ---------- 4 files changed, 1 insertion(+), 128 deletions(-) delete mode 100644 composer/utils/eval_client/mosaicml_lambda_eval_client.py delete mode 100644 tests/utils/eval_client/test_mcli_eval_client.py diff --git a/composer/utils/__init__.py b/composer/utils/__init__.py index 9618d5f837..f04da5c0e8 100644 --- a/composer/utils/__init__.py +++ b/composer/utils/__init__.py @@ -30,7 +30,7 @@ is_compressed_pt, ) from composer.utils.device import get_device, is_hpu_installed, is_xla_installed -from composer.utils.eval_client import EvalClient, LambdaEvalClient, LocalEvalClient, MosaicMLLambdaEvalClient +from composer.utils.eval_client import EvalClient, LambdaEvalClient, LocalEvalClient from composer.utils.file_helpers import ( FORMAT_NAME_WITH_DIST_AND_TIME_TABLE, FORMAT_NAME_WITH_DIST_TABLE, @@ -140,7 +140,6 @@ 'EvalClient', 'LambdaEvalClient', 'LocalEvalClient', - 'MosaicMLLambdaEvalClient', 'partial_format', 'add_vision_dataset_transform', 'VersionedDeprecationWarning', diff --git a/composer/utils/eval_client/__init__.py b/composer/utils/eval_client/__init__.py index 95b780043a..98bcdd87dc 100644 --- a/composer/utils/eval_client/__init__.py +++ b/composer/utils/eval_client/__init__.py @@ -6,11 +6,9 @@ from composer.utils.eval_client.eval_client import EvalClient from composer.utils.eval_client.lambda_eval_client import LambdaEvalClient from composer.utils.eval_client.local_eval_client import LocalEvalClient -from composer.utils.eval_client.mosaicml_lambda_eval_client import MosaicMLLambdaEvalClient __all__ = [ 'EvalClient', 'LambdaEvalClient', 'LocalEvalClient', - 'MosaicMLLambdaEvalClient', ] diff --git a/composer/utils/eval_client/mosaicml_lambda_eval_client.py b/composer/utils/eval_client/mosaicml_lambda_eval_client.py deleted file mode 100644 index b0418bf86a..0000000000 --- a/composer/utils/eval_client/mosaicml_lambda_eval_client.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""MCLI compatible eval client.""" -import logging -import os -import time -from http import HTTPStatus - -import mcli -import numpy as np - -from composer.utils.eval_client.eval_client import EvalClient - -__all__ = ['MosaicMLLambdaEvalClient'] -log = logging.getLogger(__name__) - - -class MosaicMLLambdaEvalClient(EvalClient): - """Utility for creating a client for and invoking an AWS Lambda through MCLI.""" - - def __init__(self, backoff: int = 3, num_retries: int = 5) -> None: - """Checks that the requisite environment variables are in the EvalClient. - - `MOSAICML_ACCESS_TOKEN_ENV_VAR` environment variable must be set to access the platform. 
- """ - from composer.loggers.mosaicml_logger import \ - MOSAICML_ACCESS_TOKEN_ENV_VAR # in-line import to avoid circular import - - if MOSAICML_ACCESS_TOKEN_ENV_VAR not in os.environ: - raise RuntimeError('Cannot use MosaicML Lambda Client Eval without setting MOSAICML_ACCESS_TOKEN_ENV_VAR.') - log.debug('Running code eval through MosaicMLLambdaEvalClient.') - self.backoff = backoff - self.num_retries = num_retries - - def invoke(self, payload: list[list[list[dict[str, str]]]]) -> list[list[list[bool]]]: - """Invoke a batch of provided payloads for code evaluations.""" - num_beams = len(payload[0]) - num_tests = [len(generation_payload[0]) for generation_payload in payload] - cum_tests = (np.cumsum([0] + num_tests[:-1]) * num_beams).tolist() - test_cases = [ - test_case for generation_payload in payload for beam_payload in generation_payload - for test_case in beam_payload - ] - ret_helper = [False] * len(test_cases) - for i in range(self.num_retries): - try: - ret_helper = mcli.get_code_eval_output(test_cases).data # pyright: ignore[reportGeneralTypeIssues] - break - except mcli.MAPIException as e: - if e.status >= 500: - if i == self.num_retries - 1: - log.error(f'Failed to get code eval output after {self.num_retries} retries. Error: {e}') - log.warning(f'Failed to get code eval output, retrying in {self.backoff**i} seconds.') - time.sleep(self.backoff**i) - elif e.status == HTTPStatus.UNAUTHORIZED: - raise RuntimeError( - 'Failed to get code eval output due to UNAUTHORIZED error. ' - 'Please ensure you have access to MosaicMLLambdaEvalClient.', - ) from e - else: - log.error(f'Failed to get code eval output with unexpected MAPIException. Error: {e}') - break - except TimeoutError as e: - if i == self.num_retries - 1: - log.error(f'Failed to get code eval output after {self.num_retries} retries. Error: {e}') - log.warning(f'Failed to get code eval output, retrying in {self.backoff**i} seconds.') - time.sleep(self.backoff**i) - except Exception as e: - log.error(f'Failed to get code eval output with unexpected error. 
Error: {e}') - break - - ret = [] - for i in range(len(payload)): - ret_payload = [] - for j in range(num_beams): - ret_num_beams = [] - for k in range(num_tests[i]): - ret_num_beams.append(ret_helper[cum_tests[i] + j * num_tests[i] + k]) - ret_payload.append(ret_num_beams) - ret.append(ret_payload) - return ret diff --git a/tests/utils/eval_client/test_mcli_eval_client.py b/tests/utils/eval_client/test_mcli_eval_client.py deleted file mode 100644 index 56f13524a1..0000000000 --- a/tests/utils/eval_client/test_mcli_eval_client.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 -import pytest - -from composer.utils import MosaicMLLambdaEvalClient - - -@pytest.mark.remote -@pytest.mark.gpu # must run on MosaicML platform -@pytest.mark.parametrize( - 'code, result, language', - [ - ['def add_1(x):\n return x + 1', True, 'python'], - ['def add_1(x):\n return y + 1', False, 'python'], - ['def add_1(x):\n while True:\n x += 1', False, 'python'], - ['def add_1(x): return x + 2', False, 'python'], - ['int add_1(int x) {\n\treturn x + 1;\n}', True, 'c++'], - ['int add_1(int x) {\n\treturn y + 1;\n}', False, 'c++'], - ['int add_1(int x) {\n\twhile (true) {\n\t\tx += 1;\n\t}\n}', False, 'c++'], - ['int add_1(int x) {\n\treturn x + 2;\n}', False, 'c++'], - ['int add_1(int x) {\n\treturn x + 1;\n}', True, 'c'], - ['int add_1(int x) {\n\treturn y + 1;\n}', False, 'c'], - ['int add_1(int x) {\n\twhile (true) {\n\t\tx += 1;\n\t}\n}', False, 'c'], - ['int add_1(int x) {\n\treturn x + 2;\n}', False, 'c'], - ['function add_1(x) {\n\treturn x+1;\n}', True, 'javascript'], - ['function add_1(x) {\n\treturn y+1;\n}', False, 'javascript'], - ['function add_1(x) {\n\twhile (true) {\n\t\tx += 1;\n\t}\n}', False, 'javascript'], - ['function add_1(x) {\n\treturn x+2;\n}', False, 'javascript'], - ], -) -def test_mcli_invoke(code, result, language): - """Test invocation function for MosaicMLLambdaEvalClient with code that succeeds, fails compilation, times out, and is incorrect in C, C++, Python, JS. 
- """ - eval_client = MosaicMLLambdaEvalClient() - input = '(1,)' if language == 'python' else '1' - assert eval_client.invoke([[[{ - 'code': code, - 'input': input, - 'output': '2', - 'entry_point': 'add_1', - 'language': language, - }]]]) == [[[result]]] From 8fbca389a23a0686a7a7b63c9af4455d83c47a33 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Mon, 1 Jul 2024 13:22:20 -0700 Subject: [PATCH 48/69] Relax hf hub pin (#3435) --- pyproject.toml | 2 -- setup.py | 4 ++-- tests/test_full_nlp.py | 2 +- tests/utils/test_inference.py | 7 +++++-- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3b2469b935..8ca97bc494 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -160,8 +160,6 @@ filterwarnings = [ '''ignore:.*an autograd kernel was not registered to the Autograd key.*:UserWarning''', # Ignore save_state_dict / load_state_dict deprecation warnings '''ignore:'.*_state_dict' is deprecated and will be removed in future versions.*:UserWarning''', - # Ignore HF deprecation which affects their own libraries - '''ignore:'.*`resume_download` is deprecated and will be removed in version.*:FutureWarning''' ] # Coverage diff --git a/setup.py b/setup.py index 768f6655ea..4bee19e1cf 100644 --- a/setup.py +++ b/setup.py @@ -179,9 +179,9 @@ def package_files(prefix: str, directory: str, extension: str): ] extra_deps['nlp'] = [ - 'transformers>=4.11,!=4.34.0,<4.42', + 'transformers>=4.11,!=4.34.0,<4.43', 'datasets>=2.4,<3', - 'huggingface-hub>=0.21.2,<0.23', + 'huggingface-hub>=0.21.2,<0.24', ] extra_deps['peft'] = [ diff --git a/tests/test_full_nlp.py b/tests/test_full_nlp.py index 0ebb927c67..14380b38fe 100644 --- a/tests/test_full_nlp.py +++ b/tests/test_full_nlp.py @@ -237,7 +237,7 @@ def inference_test_helper( ('simpletransformer', [], 'torchscript'), ], ) -@pytest.mark.parametrize('onnx_opset_version', [13, None]) +@pytest.mark.parametrize('onnx_opset_version', [14, None]) def test_full_nlp_pipeline( model_type, algorithms, diff --git a/tests/utils/test_inference.py b/tests/utils/test_inference.py index f1c45b8562..e7c374377d 100644 --- a/tests/utils/test_inference.py +++ b/tests/utils/test_inference.py @@ -106,7 +106,7 @@ def test_export_for_inference_input_and_output_names(): @device('cpu', 'gpu') -@pytest.mark.parametrize('onnx_opset_version', [13, None]) +@pytest.mark.parametrize('onnx_opset_version', [14, None]) def test_huggingface_export_for_inference_onnx(onnx_opset_version, tiny_bert_config, device): pytest.importorskip('onnx') pytest.importorskip('onnxruntime') @@ -130,7 +130,10 @@ def test_huggingface_export_for_inference_onnx(onnx_opset_version, tiny_bert_con input_ids = torch.randint(low=0, high=30522, size=(2, 32)) labels = torch.randint(low=0, high=1, size=(2,)) token_type_ids = torch.zeros(size=(2, 32), dtype=torch.int64) - attention_mask = torch.randint(low=0, high=1, size=(2, 32)) + attention_mask = torch.ones(size=(2, 32), dtype=torch.int64) + # Mask some tokens + attention_mask[0, 2:] = 0 + sample_input = { 'input_ids': input_ids, 'labels': labels, From 54d58c962ed9a513a3cd64caf289cdfcd0bbd8c8 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Mon, 1 Jul 2024 17:34:31 -0700 Subject: [PATCH 49/69] Pytest skip 2 (#3448) * test * test * test * test * test * test * fix * sleep before skip * fix * pull request target * revert * revery pr_target branches * sleep 1 * 10 sec * uncomment * dist barrier * test * dist works! 
* update 0.0.9 * mihir comment Co-authored-by: Mihir Patel --------- Co-authored-by: v-chen_data Co-authored-by: Mihir Patel --- tests/trainer/test_fsdp_checkpoint.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/trainer/test_fsdp_checkpoint.py b/tests/trainer/test_fsdp_checkpoint.py index 3b4f26024c..154ed6b282 100644 --- a/tests/trainer/test_fsdp_checkpoint.py +++ b/tests/trainer/test_fsdp_checkpoint.py @@ -836,6 +836,7 @@ def test_fsdp_partitioned_state_dict_load( if weights_only and autoresume: pytest.skip('Weights only with autoresume is not supported') if (use_tp or use_hsdp) and version.parse(torch.__version__) < version.parse('2.3.0'): + dist.barrier() # Sync to avoid race conditions on cleaning up tmp_path pytest.skip('HSDP and TP require torch 2.3.0 or later') load_ignore_keys = [] if load_ignore_keys is None else load_ignore_keys From 5a129d1d279b3b20a6399d6593da471ac550e631 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Tue, 2 Jul 2024 07:56:56 -0600 Subject: [PATCH 50/69] bump version (#3450) --- composer/_version.py | 2 +- docker/README.md | 4 ++-- docker/build_matrix.yaml | 12 ++++++------ docker/generate_build_matrix.py | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/composer/_version.py b/composer/_version.py index a38b61a722..82928466f9 100644 --- a/composer/_version.py +++ b/composer/_version.py @@ -3,4 +3,4 @@ """The Composer Version.""" -__version__ = '0.24.0.dev0' +__version__ = '0.23.5' diff --git a/docker/README.md b/docker/README.md index e10af0a194..a0514ecb3d 100644 --- a/docker/README.md +++ b/docker/README.md @@ -15,8 +15,8 @@ all dependencies for both NLP and Vision models. They are built on top of the | Composer Version | CUDA Support | Docker Tag | |--------------------|----------------|----------------------------------------------------------------| -| 0.23.3 | Yes | `mosaicml/composer:latest`, `mosaicml/composer:0.23.3` | -| 0.23.3 | No | `mosaicml/composer:latest_cpu`, `mosaicml/composer:0.23.3_cpu` | +| 0.23.5 | Yes | `mosaicml/composer:latest`, `mosaicml/composer:0.23.5` | +| 0.23.5 | No | `mosaicml/composer:latest_cpu`, `mosaicml/composer:0.23.5_cpu` | **Note**: For a lightweight installation, we recommended using a [MosaicML PyTorch Image](#pytorch-images) and manually diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml index faa21b8e89..ee74d12309 100644 --- a/docker/build_matrix.yaml +++ b/docker/build_matrix.yaml @@ -208,9 +208,9 @@ TORCHVISION_VERSION: 0.16.2 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04 - COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.3 + COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.5 CUDA_VERSION: 12.1.1 - IMAGE_NAME: composer-0-23-3 + IMAGE_NAME: composer-0-23-5 MOFED_VERSION: latest-23.10 NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 @@ -231,15 +231,15 @@ PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.3.1 TAGS: - - mosaicml/composer:0.23.3 + - mosaicml/composer:0.23.5 - mosaicml/composer:latest TARGET: composer_stage TORCHVISION_VERSION: 0.18.1 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: ubuntu:20.04 - COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.3 + COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.5 CUDA_VERSION: '' - IMAGE_NAME: composer-0-23-3-cpu + IMAGE_NAME: composer-0-23-5-cpu MOFED_VERSION: latest-23.10 NVIDIA_REQUIRE_CUDA_OVERRIDE: '' PYTHON_VERSION: '3.11' @@ 
-247,7 +247,7 @@ PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.3.1 TAGS: - - mosaicml/composer:0.23.3_cpu + - mosaicml/composer:0.23.5_cpu - mosaicml/composer:latest_cpu TARGET: composer_stage TORCHVISION_VERSION: 0.18.1 diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index 9a634b0d36..74d9c7fed4 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -231,7 +231,7 @@ def _main(): composer_entries = [] # The `GIT_COMMIT` is a placeholder and Jenkins will substitute it with the actual git commit for the `composer_staging` images - composer_versions = ['0.23.3'] # Only build images for the latest composer version + composer_versions = ['0.23.5'] # Only build images for the latest composer version composer_python_versions = [PRODUCTION_PYTHON_VERSION] # just build composer against the latest for product in itertools.product(composer_python_versions, composer_versions, cuda_options): From a0806f6bfa4320cbccdeafa2d934ee7516cb8981 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 8 Jul 2024 07:47:40 -0700 Subject: [PATCH 51/69] Bump ipykernel from 6.29.2 to 6.29.5 (#3459) Bumps [ipykernel](https://github.com/ipython/ipykernel) from 6.29.2 to 6.29.5. - [Release notes](https://github.com/ipython/ipykernel/releases) - [Changelog](https://github.com/ipython/ipykernel/blob/v6.29.5/CHANGELOG.md) - [Commits](https://github.com/ipython/ipykernel/compare/v6.29.2...v6.29.5) --- updated-dependencies: - dependency-name: ipykernel dependency-type: direct:development update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 4bee19e1cf..6508aad307 100644 --- a/setup.py +++ b/setup.py @@ -107,7 +107,7 @@ def package_files(prefix: str, directory: str, extension: str): 'fasteners==0.18', # object store tests require fasteners 'pytest==7.4.4', 'ipython==8.11.0', - 'ipykernel==6.29.2', + 'ipykernel==6.29.5', 'jupyter==1.0.0', 'yamllint==1.35.1', 'recommonmark==0.7.1', From 4b71141da41a7ea4c8e1737868ca0ec365a473bb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 8 Jul 2024 07:48:06 -0700 Subject: [PATCH 52/69] Update torchmetrics requirement (#3460) Updates the requirements on [torchmetrics](https://github.com/Lightning-AI/torchmetrics) to permit the latest version. - [Release notes](https://github.com/Lightning-AI/torchmetrics/releases) - [Changelog](https://github.com/Lightning-AI/torchmetrics/blob/master/CHANGELOG.md) - [Commits](https://github.com/Lightning-AI/torchmetrics/compare/v0.10.0...v1.4.0.post0) --- updated-dependencies: - dependency-name: torchmetrics dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6508aad307..b308678902 100644 --- a/setup.py +++ b/setup.py @@ -78,7 +78,7 @@ def package_files(prefix: str, directory: str, extension: str): install_requires = [ 'pyyaml>=6.0,<7', 'tqdm>=4.62.3,<5', - 'torchmetrics>=0.10.0,<1.3.3', + 'torchmetrics>=1.4.0.post0,<1.4.1', 'torch_optimizer>=0.3.0,<0.4', 'torchvision>=0.13.1,<0.18.2', 'torch>=2.1.2,<2.3.2', From 89db4e2591106c9b69dc7364185bcdad7ca7533e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 8 Jul 2024 07:49:04 -0700 Subject: [PATCH 53/69] Bump databricks-sdk from 0.28.0 to 0.29.0 (#3456) Bumps [databricks-sdk](https://github.com/databricks/databricks-sdk-py) from 0.28.0 to 0.29.0. - [Release notes](https://github.com/databricks/databricks-sdk-py/releases) - [Changelog](https://github.com/databricks/databricks-sdk-py/blob/main/CHANGELOG.md) - [Commits](https://github.com/databricks/databricks-sdk-py/compare/v0.28.0...v0.29.0) --- updated-dependencies: - dependency-name: databricks-sdk dependency-type: direct:development update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index b308678902..8f8498392d 100644 --- a/setup.py +++ b/setup.py @@ -224,13 +224,13 @@ def package_files(prefix: str, directory: str, extension: str): extra_deps['mlflow'] = [ 'mlflow>=2.14.1,<3.0', - 'databricks-sdk==0.28.0', + 'databricks-sdk==0.29.0', 'pynvml>=11.5.0,<12', ] extra_deps['pandas'] = ['pandas>=2.0.0,<3.0'] -extra_deps['databricks'] = ['databricks-sdk==0.28.0'] +extra_deps['databricks'] = ['databricks-sdk==0.29.0'] extra_deps['all'] = {dep for deps in extra_deps.values() for dep in deps} From 6df01ba6305f560bf507cacaa02b02e94a451487 Mon Sep 17 00:00:00 2001 From: bigning Date: Mon, 8 Jul 2024 09:48:38 -0700 Subject: [PATCH 54/69] [Checkpoint] Fix symlink issue where symlink file uploaded before checkpoint files upload (#3376) * a * a * a * a * a * a * a * a * fix test * a * a * a * a * fix unit test * a * a * a * a * a * fix 2gpu unit test * a * a * a * a * fix doctest * a * fix test and lint * up * a * a * a * a * a * a * a * a * address comments * a * a * a * a * rerun test * add logging * remove debug comments * comments * a * cleanup * a * linter * lint * Update composer/callbacks/checkpoint_saver.py Co-authored-by: Evan Racah * commenst * a * fix test * fix test * comments * a --------- Co-authored-by: Evan Racah --- composer/callbacks/checkpoint_saver.py | 179 +++++++++-- .../loggers/remote_uploader_downloader.py | 59 +--- composer/trainer/trainer.py | 23 +- composer/utils/__init__.py | 7 + composer/utils/file_helpers.py | 16 + composer/utils/object_store/__init__.py | 2 + composer/utils/object_store/utils.py | 48 +++ composer/utils/remote_uploader.py | 165 +++++++++- docs/source/doctest_fixtures.py | 25 +- .../test_remote_uploader_downloader.py | 16 +- tests/trainer/test_checkpoint.py | 283 ++++++++++-------- tests/utils/test_remote_uploader.py | 26 +- 12 files changed, 607 insertions(+), 242 deletions(-) create mode 100644 composer/utils/object_store/utils.py diff --git a/composer/callbacks/checkpoint_saver.py b/composer/callbacks/checkpoint_saver.py index 
c17b874c21..29468e66c3 100644 --- a/composer/callbacks/checkpoint_saver.py +++ b/composer/callbacks/checkpoint_saver.py @@ -20,6 +20,8 @@ FORMAT_NAME_WITH_DIST_AND_TIME_TABLE, FORMAT_NAME_WITH_DIST_TABLE, PartialFilePath, + RemoteFilesExistingCheckStatus, + RemoteUploader, checkpoint, create_interval_scheduler, create_symlink_file, @@ -28,6 +30,7 @@ format_name_with_dist, format_name_with_dist_and_time, is_model_deepspeed, + parse_uri, partial_format, ) from composer.utils.checkpoint import _TORCH_DISTRIBUTED_CHECKPOINTS_METADATA_FILENAME @@ -287,8 +290,13 @@ def __init__( num_checkpoints_to_keep: int = -1, weights_only: bool = False, ignore_keys: Optional[Union[list[str], Callable[[dict], None]]] = None, + num_concurrent_uploads: int = 1, + upload_timeout_in_seconds: int = 3600, ): - folder = str(folder) + backend, _, local_folder = parse_uri(str(folder)) + if local_folder == '': + local_folder = '.' + filename = str(filename) remote_file_name = str(remote_file_name) if remote_file_name is not None else None latest_filename = str(latest_filename) if latest_filename is not None else None @@ -304,10 +312,10 @@ def __init__( self.save_interval = save_interval self.last_checkpoint_batch: Optional[Time] = None - self.folder = folder + self.folder = local_folder - self.filename = PartialFilePath(filename.lstrip('/'), folder) - self.latest_filename = PartialFilePath(latest_filename.lstrip('/'), folder) if latest_filename else None + self.filename = PartialFilePath(filename.lstrip('/'), local_folder) + self.latest_filename = PartialFilePath(latest_filename.lstrip('/'), local_folder) if latest_filename else None self.remote_file_name = PartialFilePath(remote_file_name) if remote_file_name else None self.latest_remote_file_name = PartialFilePath(latest_remote_file_name) if latest_remote_file_name else None @@ -320,6 +328,23 @@ def __init__( self.start_batch = None + self.remote_uploader = None + self.rank_saves_symlinks: bool = False + self.tmp_dir_for_symlink = tempfile.TemporaryDirectory() + self.num_concurrent_uploads = num_concurrent_uploads + self.upload_timeout_in_seconds = upload_timeout_in_seconds + # Allow unit test to override this to make it faster + self._symlink_upload_wait_before_next_try_in_seconds = 30.0 + self.pid = os.getpid() + self.symlink_count = 0 + self.symlink_upload_tasks = [] + + if backend != '': + self.remote_uploader = RemoteUploader( + remote_folder=str(folder), + num_concurrent_uploads=self.num_concurrent_uploads, + ) + def init(self, state: State, logger: Logger) -> None: # If MLFlowLogger is being used, format MLFlow-specific placeholders in the save folder and paths. # Assumes that MLFlowLogger comes before CheckpointSaver in the list of loggers. 
@@ -346,9 +371,10 @@ def init(self, state: State, logger: Logger) -> None: self.latest_remote_file_name.filename, **mlflow_format_kwargs, ) - break + if self.remote_uploader is not None: + self.remote_uploader.init() folder = format_name_with_dist(self.folder, state.run_name) os.makedirs(folder, exist_ok=True) @@ -410,6 +436,27 @@ def load_state_dict(self, state: dict[str, Any]): load_timestamp.load_state_dict(timestamp_state) self.all_saved_checkpoints_to_timestamp[save_filename] = load_timestamp + def _upload_checkpoint( + self, + remote_file_name: str, + local_file_name: str, + local_remote_file_names: list[str], + logger: Logger, + ): + if self.remote_uploader is not None: + self.remote_uploader.upload_file_async( + remote_file_name=remote_file_name, + file_path=pathlib.Path(local_file_name), + overwrite=self.overwrite, + ) + local_remote_file_names.append(remote_file_name) + else: + logger.upload_file( + remote_file_name=remote_file_name, + file_path=local_file_name, + overwrite=self.overwrite, + ) + def _save_checkpoint(self, state: State, logger: Logger): self.last_checkpoint_batch = state.timestamp.batch @@ -432,7 +479,14 @@ def _save_checkpoint(self, state: State, logger: Logger): ) log.debug(f'Checkpoint locally saved to {saved_path}') + self.symlink_count += 1 + # Remote checkpoint file names on this rank + local_remote_file_names = [] + all_remote_filenames = [] + if not saved_path: # not all ranks save + if self.remote_file_name is not None and self.remote_uploader is not None: + all_remote_filenames = dist.all_gather_object(local_remote_file_names) return metadata_local_file_path = None @@ -443,6 +497,7 @@ def _save_checkpoint(self, state: State, logger: Logger): state.timestamp, ) + self.rank_saves_symlinks = dist.get_global_rank() == 0 or not state.fsdp_sharded_state_dict_enabled if self.latest_filename is not None and self.num_checkpoints_to_keep != 0: symlink = self.latest_filename.format(state, is_deepspeed) os.makedirs(os.path.dirname(symlink), exist_ok=True) @@ -455,8 +510,7 @@ def _save_checkpoint(self, state: State, logger: Logger): src_path = str(pathlib.Path(saved_path).parent) else: src_path = saved_path - this_rank_saves_symlinks = dist.get_global_rank() == 0 or not state.fsdp_sharded_state_dict_enabled - if this_rank_saves_symlinks: + if self.rank_saves_symlinks: os.symlink(os.path.relpath(src_path, os.path.dirname(symlink)), symlink) # if remote file name provided, upload the checkpoint @@ -482,10 +536,11 @@ def _save_checkpoint(self, state: State, logger: Logger): state.timestamp, ) assert metadata_local_file_path is not None - logger.upload_file( + self._upload_checkpoint( remote_file_name=metadata_remote_file_name, - file_path=metadata_local_file_path, - overwrite=self.overwrite, + local_file_name=metadata_local_file_path, + local_remote_file_names=local_remote_file_names, + logger=logger, ) else: remote_file_name = self.remote_file_name.format( @@ -495,12 +550,20 @@ def _save_checkpoint(self, state: State, logger: Logger): log.debug(f'Uploading checkpoint to {remote_file_name}') try: - logger.upload_file(remote_file_name=remote_file_name, file_path=saved_path, overwrite=self.overwrite) + self._upload_checkpoint( + remote_file_name=remote_file_name, + local_file_name=saved_path, + local_remote_file_names=local_remote_file_names, + logger=logger, + ) except FileExistsError as e: raise FileExistsError( f'Uploading checkpoint failed with error: {e}. overwrite was set to {self.overwrite}. 
To overwrite checkpoints with Trainer, set save_overwrite to True.', ) from e + if self.remote_uploader is not None: + all_remote_filenames = dist.all_gather_object(local_remote_file_names) + # symlinks stay the same with sharded checkpointing if self.latest_remote_file_name is not None: symlink_name = self.latest_remote_file_name.format( @@ -509,17 +572,31 @@ def _save_checkpoint(self, state: State, logger: Logger): ).lstrip('/') + '.symlink' # create and upload a symlink file - with tempfile.TemporaryDirectory() as tmpdir: - symlink_filename = os.path.join(tmpdir, 'latest.symlink') - # Sharded checkpoints for torch >2.0 use directories not files for load_paths - if state.fsdp_sharded_state_dict_enabled: - src_path = str(pathlib.Path(remote_file_name).parent) + symlink_filename = os.path.join( + self.tmp_dir_for_symlink.name, + f'latest.{self.symlink_count}.symlink', + ) + # Sharded checkpoints for torch >2.0 use directories not files for load_paths + if state.fsdp_sharded_state_dict_enabled: + src_path = str(pathlib.Path(remote_file_name).parent) + else: + src_path = remote_file_name + log.debug(f'Creating symlink file {symlink_filename} -> {src_path}') + if self.rank_saves_symlinks: + create_symlink_file(src_path, symlink_filename) + if self.remote_uploader is not None: + remote_checkpoint_file_names = [] + for file_names in all_remote_filenames: + remote_checkpoint_file_names += file_names + check_remote_files_exist_future = self.remote_uploader.check_remote_files_exist_async( + remote_checkpoint_file_names=remote_checkpoint_file_names, + max_wait_time_in_seconds=self.upload_timeout_in_seconds, + wait_before_next_try_in_seconds=self._symlink_upload_wait_before_next_try_in_seconds, + ) + self.symlink_upload_tasks.append( + (check_remote_files_exist_future, symlink_filename, symlink_name), + ) else: - src_path = remote_file_name - log.debug(f'Creating symlink file {symlink_filename} -> {src_path}') - this_rank_saves_symlinks = dist.get_global_rank() == 0 or not state.fsdp_sharded_state_dict_enabled - if this_rank_saves_symlinks: - create_symlink_file(src_path, symlink_filename) logger.upload_file( remote_file_name=symlink_name, file_path=symlink_filename, @@ -532,7 +609,6 @@ def _save_checkpoint(self, state: State, logger: Logger): self._rotate_checkpoints(sharding_enabled=state.fsdp_sharded_state_dict_enabled) def _rotate_checkpoints(self, sharding_enabled: bool = False): - while len(self.saved_checkpoints) > self.num_checkpoints_to_keep: prefix_dir = None checkpoint_to_delete = self.saved_checkpoints.pop(0) @@ -542,3 +618,62 @@ def _rotate_checkpoints(self, sharding_enabled: bool = False): else: if dist.get_global_rank() == 0: shutil.rmtree(prefix_dir) + + def batch_end(self, state: State, logger: Logger) -> None: + del state, logger # unused + if self.remote_uploader is None: + return + self.remote_uploader.check_workers() + if not self.rank_saves_symlinks: + return + undone_symlink_upload_tasks = [] + for (check_remote_files_exist_future, local_symlink_file, + remote_symlink_file) in reversed(self.symlink_upload_tasks): + if not check_remote_files_exist_future.done(): + undone_symlink_upload_tasks.insert( + 0, + (check_remote_files_exist_future, local_symlink_file, remote_symlink_file), + ) + continue + if check_remote_files_exist_future.done(): + result = check_remote_files_exist_future.result() + if result == RemoteFilesExistingCheckStatus.EXIST: + self.remote_uploader.upload_file_async( + remote_file_name=remote_symlink_file, + file_path=local_symlink_file, + overwrite=True, + ) + 
break + else: + raise RuntimeError(f'Failed to check if checkpoint files upload finish: {result}') + self.symlink_upload_tasks = undone_symlink_upload_tasks + + def fit_end(self, state: State, logger: Logger) -> None: + del state, logger # unused + if self.remote_uploader is None: + return + log.info('Waiting for checkpoint uploading to finish') + self.remote_uploader.wait() + if self.rank_saves_symlinks and len(self.symlink_upload_tasks) > 0: + log.debug('Uploading symlink to the latest checkpoint') + # We only need to upload a symlink pointing to the latest checkpoint files, so we can ignore successful uploads of older checkpoints. + check_remote_files_exist_future, local_symlink_file, remote_symlink_file = self.symlink_upload_tasks[-1] + result = check_remote_files_exist_future.result() + if result == RemoteFilesExistingCheckStatus.EXIST: + symlink_upload_future = self.remote_uploader.upload_file_async( + remote_file_name=remote_symlink_file, + file_path=local_symlink_file, + overwrite=True, + ) + symlink_upload_future.result() + else: + raise RuntimeError(f'Failed to check if checkpoint files upload finish: {result}') + log.info('Checkpoint uploading finished!') + + def post_close(self): + if self.remote_uploader is not None: + # Wait the symlink file upload to finish and close remote uploader + try: + self.remote_uploader.wait_and_close() + except Exception as e: + log.error(f'RemoteUploader run into exception {e}') diff --git a/composer/loggers/remote_uploader_downloader.py b/composer/loggers/remote_uploader_downloader.py index 981cc4c650..9378d5a8d4 100644 --- a/composer/loggers/remote_uploader_downloader.py +++ b/composer/loggers/remote_uploader_downloader.py @@ -25,19 +25,15 @@ from composer.loggers import Logger, MosaicMLLogger from composer.loggers.logger_destination import LoggerDestination from composer.utils import ( - GCSObjectStore, - LibcloudObjectStore, MLFlowObjectStore, ObjectStore, ObjectStoreTransientError, - OCIObjectStore, - S3ObjectStore, - SFTPObjectStore, - UCObjectStore, + build_remote_backend, dist, format_name_with_dist, get_file, retry, + validate_credentials, ) from composer.utils.object_store.mlflow_object_store import MLFLOW_DBFS_PATH_PREFIX @@ -50,37 +46,6 @@ __all__ = ['RemoteUploaderDownloader'] -def _build_remote_backend(remote_backend_name: str, backend_kwargs: dict[str, Any]): - remote_backend_cls = None - remote_backend_name_to_cls = { - 's3': S3ObjectStore, - 'oci': OCIObjectStore, - 'sftp': SFTPObjectStore, - 'libcloud': LibcloudObjectStore, - 'gs': GCSObjectStore, - } - - # Handle `dbfs` backend as a special case, since it can map to either :class:`.UCObjectStore` - # or :class:`.MLFlowObjectStore`. - if remote_backend_name == 'dbfs': - path = backend_kwargs['path'] - if path.startswith(MLFLOW_DBFS_PATH_PREFIX): - remote_backend_cls = MLFlowObjectStore - else: - # Validate if the path conforms to the requirements for UC volume paths - UCObjectStore.validate_path(path) - remote_backend_cls = UCObjectStore - else: - remote_backend_cls = remote_backend_name_to_cls.get(remote_backend_name, None) - if remote_backend_cls is None: - supported_remote_backends = list(remote_backend_name_to_cls.keys()) + ['dbfs'] - raise ValueError( - f'The remote backend {remote_backend_name} is not supported. Please use one of ({supported_remote_backends})', - ) - - return remote_backend_cls(**backend_kwargs) - - class RemoteUploaderDownloader(LoggerDestination): r"""Logger destination that uploads (downloads) files to (from) a remote backend. 
@@ -339,7 +304,7 @@ def __init__( def remote_backend(self) -> ObjectStore: """The :class:`.ObjectStore` instance for the main thread.""" if self._remote_backend is None: - self._remote_backend = _build_remote_backend(self.remote_backend_name, self.backend_kwargs) + self._remote_backend = build_remote_backend(self.remote_backend_name, self.backend_kwargs) return self._remote_backend def init(self, state: State, logger: Logger) -> None: @@ -359,7 +324,7 @@ def init(self, state: State, logger: Logger) -> None: retry( ObjectStoreTransientError, self.num_attempts, - )(lambda: _validate_credentials(self.remote_backend, file_name_to_test))() + )(lambda: validate_credentials(self.remote_backend, file_name_to_test))() # If the remote backend is an `MLFlowObjectStore`, the original path kwarg may have placeholders that can be # updated with information generated at runtime, i.e., the MLFlow experiment and run IDs. This information @@ -635,20 +600,6 @@ def _remote_file_name(self, remote_file_name: str): return key_name -def _validate_credentials( - remote_backend: ObjectStore, - remote_file_name_to_test: str, -) -> None: - # Validates the credentials by attempting to touch a file in the bucket - # raises an error if there was a credentials failure. - with tempfile.NamedTemporaryFile('wb') as f: - f.write(b'credentials_validated_successfully') - remote_backend.upload_object( - object_name=remote_file_name_to_test, - filename=f.name, - ) - - def _upload_worker( file_queue: Union[queue.Queue[tuple[str, str, bool]], multiprocessing.JoinableQueue[tuple[str, str, bool]]], completed_queue: Union[queue.Queue[str], multiprocessing.JoinableQueue[str]], @@ -663,7 +614,7 @@ def _upload_worker( The worker will continuously poll ``file_queue`` for files to upload. Once ``is_finished`` is set, the worker will exit once ``file_queue`` is empty. """ - remote_backend = _build_remote_backend(remote_backend_name, backend_kwargs) + remote_backend = build_remote_backend(remote_backend_name, backend_kwargs) while True: try: file_path_to_upload, remote_file_name, overwrite = file_queue.get(block=True, timeout=0.5) diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index f5a6b57d77..c752187ba6 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -1387,16 +1387,6 @@ def __init__( mosaicml_logger = MosaicMLLogger() loggers.append(mosaicml_logger) - # Remote Uploader Downloader - # Keep the ``RemoteUploaderDownloader`` below client-provided loggers so the loggers init callbacks run before - # the ``RemoteUploaderDownloader`` init. This is necessary to use an ``MLFlowObjectStore`` to log objects to a - # run managed by an ``MLFlowLogger``, as the ``MLFlowObjectStore`` relies on the ``MLFlowLogger`` to initialize - # the active MLFlow run. - if save_folder is not None: - remote_ud = maybe_create_remote_uploader_downloader_from_uri(save_folder, loggers) - if remote_ud is not None: - loggers.append(remote_ud) - # Logger self.logger = Logger(state=self.state, destinations=loggers) @@ -1451,14 +1441,12 @@ def __init__( # path then we assume they just want their checkpoints saved directly in their # bucket. if parsed_save_folder == '': - folder = '.' remote_file_name = save_filename latest_remote_file_name = save_latest_filename # If they actually specify a path, then we use that for their local save path # and we prefix save_filename with that path for remote_file_name. 
else: - folder = parsed_save_folder remote_file_name = str(Path(parsed_save_folder) / Path(save_filename)) if save_latest_filename is not None: latest_remote_file_name = str(Path(parsed_save_folder) / Path(save_latest_filename)) @@ -1466,7 +1454,7 @@ def __init__( latest_remote_file_name = None self._checkpoint_saver = CheckpointSaver( - folder=folder, + folder=save_folder, filename=save_filename, remote_file_name=remote_file_name, latest_filename=save_latest_filename, @@ -1889,14 +1877,17 @@ def _try_checkpoint_download( self, latest_checkpoint_path: str, save_latest_remote_file_name: str, - loggers: Sequence[LoggerDestination], + loggers: Sequence[Union[LoggerDestination, ObjectStore]], load_progress_bar: bool, ) -> None: """Attempts to download the checkpoint from the logger destinations.""" log.debug( f'Trying to download {save_latest_remote_file_name} to {latest_checkpoint_path} on rank {dist.get_global_rank()}', ) - for logger in loggers: + remote_destination = list(loggers) + if self._checkpoint_saver is not None and self._checkpoint_saver.remote_uploader is not None: + remote_destination.append(self._checkpoint_saver.remote_uploader.remote_backend) + for logger in remote_destination: try: # Fetch from logger. If it succeeds, stop trying the rest of the loggers get_file( @@ -1938,7 +1929,7 @@ def _get_autoresume_checkpoint( f'Looking for autoresume checkpoint: {save_latest_remote_file_name} (remote), {latest_checkpoint_path} (local)', ) - if self.state.deepspeed_enabled or self.state.fsdp_sharded_state_dict_enabled: + if self.state.deepspeed_enabled: # If latest checkpoint is not saved locally, try to fetch from loggers if not os.path.exists(latest_checkpoint_path): log.debug(f'Attempting to download the checkpoint on to rank {dist.get_global_rank()}') diff --git a/composer/utils/__init__.py b/composer/utils/__init__.py index f04da5c0e8..0850fd2bdd 100644 --- a/composer/utils/__init__.py +++ b/composer/utils/__init__.py @@ -44,6 +44,7 @@ maybe_create_object_store_from_uri, maybe_create_remote_uploader_downloader_from_uri, parse_uri, + validate_credentials, ) from composer.utils.import_helpers import MissingConditionalImportError, import_object from composer.utils.inference import ExportFormat, Transform, export_for_inference, export_with_logger, quantize_dynamic @@ -72,8 +73,10 @@ S3ObjectStore, SFTPObjectStore, UCObjectStore, + build_remote_backend, ) from composer.utils.parallelism import FSDPConfig, ParallelismConfig, TPConfig, create_fsdp_config +from composer.utils.remote_uploader import RemoteFilesExistingCheckStatus, RemoteUploader from composer.utils.retrying import retry from composer.utils.string_enum import StringEnum from composer.utils.warnings import VersionedDeprecationWarning @@ -155,4 +158,8 @@ 'ParallelismConfig', 'MLFLOW_EXPERIMENT_ID_FORMAT_KEY', 'MLFLOW_RUN_ID_FORMAT_KEY', + 'RemoteUploader', + 'validate_credentials', + 'build_remote_backend', + 'RemoteFilesExistingCheckStatus', ] diff --git a/composer/utils/file_helpers.py b/composer/utils/file_helpers.py index 2d14cc27ea..11d10328ea 100644 --- a/composer/utils/file_helpers.py +++ b/composer/utils/file_helpers.py @@ -49,6 +49,7 @@ 'maybe_create_object_store_from_uri', 'maybe_create_remote_uploader_downloader_from_uri', 'parse_uri', + 'validate_credentials', ] @@ -737,3 +738,18 @@ def create_symlink_file( raise ValueError('The symlink filename must end with .symlink.') with open(destination_filename, 'x') as f: f.write(existing_path) + + +def validate_credentials( + remote_backend: ObjectStore, + 
remote_file_name_to_test: str, +): + """Upload a tiny text file to test if the credentials are setup correctly.""" + # Validates the credentials by attempting to touch a file in the bucket + # raises an error if there was a credentials failure. + with tempfile.NamedTemporaryFile('wb') as f: + f.write(b'credentials_validated_successfully') + remote_backend.upload_object( + object_name=remote_file_name_to_test, + filename=f.name, + ) diff --git a/composer/utils/object_store/__init__.py b/composer/utils/object_store/__init__.py index 3c70257e08..6171013c2c 100644 --- a/composer/utils/object_store/__init__.py +++ b/composer/utils/object_store/__init__.py @@ -15,6 +15,7 @@ from composer.utils.object_store.s3_object_store import S3ObjectStore from composer.utils.object_store.sftp_object_store import SFTPObjectStore from composer.utils.object_store.uc_object_store import UCObjectStore +from composer.utils.object_store.utils import build_remote_backend __all__ = [ 'ObjectStore', @@ -28,4 +29,5 @@ 'UCObjectStore', 'MLFLOW_EXPERIMENT_ID_FORMAT_KEY', 'MLFLOW_RUN_ID_FORMAT_KEY', + 'build_remote_backend', ] diff --git a/composer/utils/object_store/utils.py b/composer/utils/object_store/utils.py new file mode 100644 index 0000000000..0d33774bc7 --- /dev/null +++ b/composer/utils/object_store/utils.py @@ -0,0 +1,48 @@ +# Copyright 2024 MosaicML Composer authors +# SPDX-License-Identifier: Apache-2.0 + +"""Helpers for working with object stores.""" + +from typing import Any + +from composer.utils.object_store.gcs_object_store import GCSObjectStore +from composer.utils.object_store.libcloud_object_store import LibcloudObjectStore +from composer.utils.object_store.mlflow_object_store import MLFLOW_DBFS_PATH_PREFIX, MLFlowObjectStore +from composer.utils.object_store.oci_object_store import OCIObjectStore +from composer.utils.object_store.s3_object_store import S3ObjectStore +from composer.utils.object_store.sftp_object_store import SFTPObjectStore +from composer.utils.object_store.uc_object_store import UCObjectStore + +__all__ = ['build_remote_backend'] + + +def build_remote_backend(remote_backend_name: str, backend_kwargs: dict[str, Any]): + """Build object store given the backend name and kwargs.""" + remote_backend_cls = None + remote_backend_name_to_cls = { + 's3': S3ObjectStore, + 'oci': OCIObjectStore, + 'sftp': SFTPObjectStore, + 'libcloud': LibcloudObjectStore, + 'gs': GCSObjectStore, + } + + # Handle `dbfs` backend as a special case, since it can map to either :class:`.UCObjectStore` + # or :class:`.MLFlowObjectStore`. + if remote_backend_name == 'dbfs': + path = backend_kwargs['path'] + if path.startswith(MLFLOW_DBFS_PATH_PREFIX): + remote_backend_cls = MLFlowObjectStore + else: + # Validate if the path conforms to the requirements for UC volume paths + UCObjectStore.validate_path(path) + remote_backend_cls = UCObjectStore + else: + remote_backend_cls = remote_backend_name_to_cls.get(remote_backend_name, None) + if remote_backend_cls is None: + supported_remote_backends = list(remote_backend_name_to_cls.keys()) + ['dbfs'] + raise ValueError( + f'The remote backend {remote_backend_name} is not supported. 
Please use one of ({supported_remote_backends})', + ) + + return remote_backend_cls(**backend_kwargs) diff --git a/composer/utils/remote_uploader.py b/composer/utils/remote_uploader.py index c26c73a319..33793e7c91 100644 --- a/composer/utils/remote_uploader.py +++ b/composer/utils/remote_uploader.py @@ -12,13 +12,20 @@ import time import uuid from concurrent.futures import Future, ProcessPoolExecutor -from typing import List +from enum import Enum +from typing import Any, Optional -from composer.utils.dist import get_local_rank +from composer.utils.dist import broadcast_object_list, get_global_rank, get_local_rank from composer.utils.file_helpers import ( - maybe_create_object_store_from_uri, + parse_uri, + validate_credentials, ) -from composer.utils.object_store.object_store import ObjectStore, ObjectStoreTransientError +from composer.utils.object_store.mlflow_object_store import MLFLOW_DBFS_PATH_PREFIX, MLFlowObjectStore +from composer.utils.object_store.object_store import ( + ObjectStore, + ObjectStoreTransientError, +) +from composer.utils.object_store.utils import build_remote_backend from composer.utils.retrying import retry log = logging.getLogger(__name__) @@ -26,16 +33,55 @@ __all__ = ['RemoteUploader'] +class RemoteFilesExistingCheckStatus(Enum): + EXIST = 1 + TIMEOUT = 2 + ERROR = 3 + + +def _check_remote_files_exists( + remote_backend_name: str, + backend_kwargs: dict[str, Any], + remote_checkpoint_file_names: list[str], + main_process_pid: int, + is_remote_upload_failed: multiprocessing.Event, # pyright: ignore[reportGeneralTypeIssues] + max_wait_time_in_seconds: int = 3600, + wait_before_next_try_in_seconds: float = 30, +): + start_time = time.time() + object_store = build_remote_backend(remote_backend_name, backend_kwargs) + + for remote_file_name in remote_checkpoint_file_names: + while True: + if is_remote_upload_failed.is_set(): + log.debug(f'Stop symlink uploading since the checkpoint files uploading failed') + return RemoteFilesExistingCheckStatus.ERROR + # Return if parent process exits + try: + os.kill(main_process_pid, 0) + except OSError: + return RemoteFilesExistingCheckStatus.ERROR + try: + object_store.get_object_size(remote_file_name) + break + except Exception as e: + if not isinstance(e, FileNotFoundError): + log.debug(f'Got exception {type(e)}: {str(e)} when accessing remote file {remote_file_name}') + time.sleep(wait_before_next_try_in_seconds) + if time.time() - start_time > max_wait_time_in_seconds: + return RemoteFilesExistingCheckStatus.TIMEOUT + return RemoteFilesExistingCheckStatus.EXIST + + def _upload_file_to_object_store( - remote_folder: str, + remote_backend_name: str, + backend_kwargs: dict[str, Any], remote_file_name: str, local_file_path: str, overwrite: bool, num_attempts: int, ) -> int: - object_store: ObjectStore = maybe_create_object_store_from_uri( - remote_folder, - ) # pyright: ignore[reportGeneralTypeIssues] + object_store = build_remote_backend(remote_backend_name, backend_kwargs) @retry(ObjectStoreTransientError, num_attempts=num_attempts) def upload_file(retry_index: int = 0): @@ -72,6 +118,7 @@ class RemoteUploader: def __init__( self, remote_folder: str, + backend_kwargs: Optional[dict[str, Any]] = None, num_concurrent_uploads: int = 2, num_attempts: int = 3, ): @@ -84,18 +131,80 @@ def __init__( # A folder to use for staging uploads self._tempdir = tempfile.TemporaryDirectory() self._upload_staging_folder = self._tempdir.name + self.remote_backend_name, self.remote_bucket_name, self.path = parse_uri(remote_folder) - 
self.num_attempts = num_attempts + self.backend_kwargs: dict[str, Any] = backend_kwargs if backend_kwargs is not None else {} + if self.remote_backend_name in ['s3', 'oci', 'gs'] and 'bucket' not in self.backend_kwargs: + self.backend_kwargs['bucket'] = self.remote_bucket_name + elif self.remote_backend_name == 'libcloud': + if 'container' not in self.backend_kwargs: + self.backend_kwargs['container'] = self.remote_bucket_name + elif self.remote_backend_name == 'azure': + self.remote_backend_name = 'libcloud' + self.backend_kwargs = { + 'provider': 'AZURE_BLOBS', + 'container': self.remote_bucket_name, + 'key_environ': 'AZURE_ACCOUNT_NAME', + 'secret_environ': 'AZURE_ACCOUNT_ACCESS_KEY', + } + elif self.remote_backend_name == 'dbfs': + self.backend_kwargs['path'] = self.path + elif self.remote_backend_name == 'wandb': + raise NotImplementedError( + f'There is no implementation for WandB via URI. Please use ' + 'WandBLogger with log_artifacts set to True.', + ) + else: + raise NotImplementedError( + f'There is no implementation for the cloud backend {self.remote_backend_name} via URI. Please use ' + 'one of the supported object stores (s3, oci, gs, azure, dbfs).', + ) - self.executor = ProcessPoolExecutor( + self.num_attempts = num_attempts + self._remote_backend: Optional[ObjectStore] = None + mp_context = multiprocessing.get_context('spawn') + self.upload_executor = ProcessPoolExecutor( max_workers=num_concurrent_uploads, - mp_context=multiprocessing.get_context('spawn'), + mp_context=mp_context, ) + self.check_remote_files_exist_executor = ProcessPoolExecutor( + max_workers=2, + mp_context=mp_context, + ) + self.is_remote_upload_failed = mp_context.Manager().Event() # Used internally to track the future status. # If a future completed successfully, we'll remove it from this list # when check_workers() or wait() is called - self.futures: List[Future] = [] + self.futures: list[Future] = [] + + self.pid = os.getpid() + + @property + def remote_backend(self) -> ObjectStore: + if self._remote_backend is None: + self._remote_backend = build_remote_backend(self.remote_backend_name, self.backend_kwargs) + return self._remote_backend + + def init(self): + # If it's dbfs path like: dbfs:/databricks/mlflow-tracking/{mlflow_experiment_id}/{mlflow_run_id}/ + # We need to fill out the experiment_id and run_id + + if get_global_rank() == 0: + + @retry(ObjectStoreTransientError, num_attempts=self.num_attempts) + def _validate_credential_with_retry(): + validate_credentials(self.remote_backend, '.credentials_validated_successfully') + + _validate_credential_with_retry() + if self.path.startswith(MLFLOW_DBFS_PATH_PREFIX): + if get_global_rank() == 0: + assert isinstance(self.remote_backend, MLFlowObjectStore) + self.path = self.remote_backend.get_dbfs_path(self.path) + path_list = [self.path] + broadcast_object_list(path_list, src=0) + self.path = path_list[0] + self.backend_kwargs['path'] = self.path def upload_file_async( self, @@ -114,9 +223,10 @@ def upload_file_async( shutil.copy2(file_path, copied_path) # Async upload file - future = self.executor.submit( + future = self.upload_executor.submit( _upload_file_to_object_store, - remote_folder=self.remote_folder, + remote_backend_name=self.remote_backend_name, + backend_kwargs=self.backend_kwargs, remote_file_name=remote_file_name, local_file_path=copied_path, overwrite=overwrite, @@ -132,12 +242,13 @@ def check_workers(self): 1. if it completed with exception, raise that exception 2. 
if it completed without exception, remove it from self.futures """ - done_futures: List[Future] = [] + done_futures: list[Future] = [] for future in self.futures: if future.done(): # future.exception is a blocking call exception_or_none = future.exception() if exception_or_none is not None: + self.is_remote_upload_failed.set() raise exception_or_none else: done_futures.append(future) @@ -153,6 +264,7 @@ def wait(self): for future in self.futures: exception_or_none = future.exception() if exception_or_none is not None: + self.is_remote_upload_failed.set() raise exception_or_none self.futures = [] @@ -165,4 +277,25 @@ def wait_and_close(self): """ # make sure all workers are either running, or completed successfully self.wait() - self.executor.shutdown(wait=True) + self.upload_executor.shutdown(wait=True) + self.check_remote_files_exist_executor.shutdown(wait=True) + log.debug('Finished all uploading tasks, closing RemoteUploader') + + def check_remote_files_exist_async( + self, + remote_checkpoint_file_names: list[str], + max_wait_time_in_seconds: int = 3600, + wait_before_next_try_in_seconds: float = 30, + ): + future = self.check_remote_files_exist_executor.submit( + _check_remote_files_exists, + remote_backend_name=self.remote_backend_name, + backend_kwargs=self.backend_kwargs, + remote_checkpoint_file_names=remote_checkpoint_file_names, + main_process_pid=self.pid, + is_remote_upload_failed=self.is_remote_upload_failed, + max_wait_time_in_seconds=max_wait_time_in_seconds, + wait_before_next_try_in_seconds=wait_before_next_try_in_seconds, + ) + self.futures.append(future) + return future diff --git a/docs/source/doctest_fixtures.py b/docs/source/doctest_fixtures.py index 553d8d9b60..f54d1f69e1 100644 --- a/docs/source/doctest_fixtures.py +++ b/docs/source/doctest_fixtures.py @@ -54,7 +54,7 @@ from composer.loggers import RemoteUploaderDownloader from composer.models import ComposerModel as ComposerModel from composer.optim.scheduler import ConstantScheduler -from composer.utils import LibcloudObjectStore +from composer.utils import LibcloudObjectStore, RemoteUploader from composer.utils import ensure_tuple as ensure_tuple try: @@ -246,6 +246,29 @@ def _new_RemoteUploaderDownloader_init(self, fake_ellipses: None = None, **kwarg RemoteUploaderDownloader.__init__ = _new_RemoteUploaderDownloader_init # type: ignore +# Patch RemoteUploader __init__ function to replace arguments while preserving type +_original_RemoteUploader_init = RemoteUploader.__init__ + + +def _new_RemoteUploader_init(self, fake_ellipses: None = None, **kwargs: Any): + os.makedirs('./object_store', exist_ok=True) + kwargs.update( + num_concurrent_uploads=1, + remote_folder='libcloud://.', + backend_kwargs={ + 'provider': 'local', + 'container': '.', + 'provider_kwargs': { + 'key': os.path.abspath('./object_store'), + }, + }, + num_attempts=1, + ) + _original_RemoteUploader_init(self, **kwargs) + + +RemoteUploader.__init__ = _new_RemoteUploader_init + # Patch ObjectStore __init__ function to replace arguments while preserving type _original_libcloudObjectStore_init = LibcloudObjectStore.__init__ diff --git a/tests/loggers/test_remote_uploader_downloader.py b/tests/loggers/test_remote_uploader_downloader.py index 1f877d2dd9..b25e23a717 100644 --- a/tests/loggers/test_remote_uploader_downloader.py +++ b/tests/loggers/test_remote_uploader_downloader.py @@ -77,7 +77,7 @@ def object_store_test_helper( # Patching does not work when using multiprocessing with spawn, so we also # patch to use fork fork_context = 
multiprocessing.get_context('fork') - with patch('composer.loggers.remote_uploader_downloader.S3ObjectStore', DummyObjectStore): + with patch('composer.utils.object_store.utils.S3ObjectStore', DummyObjectStore): with patch('composer.loggers.remote_uploader_downloader.multiprocessing.get_context', lambda _: fork_context): remote_uploader_downloader = RemoteUploaderDownloader( bucket_uri='s3://{remote_dir}', @@ -227,7 +227,7 @@ def get_object_size(self, object_name: str) -> int: return super().get_object_size(object_name) fork_context = multiprocessing.get_context('fork') - with patch('composer.loggers.remote_uploader_downloader.S3ObjectStore', RetryDummyObjectStore): + with patch('composer.utils.object_store.utils.S3ObjectStore', RetryDummyObjectStore): with patch('composer.loggers.remote_uploader_downloader.multiprocessing.get_context', lambda _: fork_context): remote_uploader_downloader = RemoteUploaderDownloader( bucket_uri=f"s3://{tmp_path}/'object_store_backend", @@ -263,7 +263,7 @@ def test_race_with_overwrite(tmp_path: pathlib.Path, use_procs: bool, dummy_stat # Patching does not work when using multiprocessing with spawn, so we also # patch to use fork fork_context = multiprocessing.get_context('fork') - with patch('composer.loggers.remote_uploader_downloader.S3ObjectStore', DummyObjectStore): + with patch('composer.utils.object_store.utils.S3ObjectStore', DummyObjectStore): with patch('composer.loggers.remote_uploader_downloader.multiprocessing.get_context', lambda _: fork_context): # Create the object store logger remote_uploader_downloader = RemoteUploaderDownloader( @@ -307,7 +307,7 @@ def test_race_with_overwrite(tmp_path: pathlib.Path, use_procs: bool, dummy_stat def test_close_on_failure(tmp_path: pathlib.Path, dummy_state: State): """Test that .close() and .post_close() does not hang even when a worker crashes.""" - with patch('composer.loggers.remote_uploader_downloader.S3ObjectStore', DummyObjectStore): + with patch('composer.utils.object_store.utils.S3ObjectStore', DummyObjectStore): # Create the object store logger remote_uploader_downloader = RemoteUploaderDownloader( bucket_uri=f"s3://{tmp_path}/'object_store_backend", @@ -355,9 +355,9 @@ def test_close_on_failure(tmp_path: pathlib.Path, dummy_state: State): def test_valid_backend_names(): valid_backend_names = ['s3', 'libcloud', 'sftp'] - with patch('composer.loggers.remote_uploader_downloader.S3ObjectStore') as _, \ - patch('composer.loggers.remote_uploader_downloader.SFTPObjectStore') as _, \ - patch('composer.loggers.remote_uploader_downloader.LibcloudObjectStore') as _: + with patch('composer.utils.object_store.utils.S3ObjectStore') as _, \ + patch('composer.utils.object_store.utils.SFTPObjectStore') as _, \ + patch('composer.utils.object_store.utils.LibcloudObjectStore') as _: for name in valid_backend_names: remote_uploader_downloader = RemoteUploaderDownloader(bucket_uri=f'{name}://not-a-real-bucket') # Access the remote_backend property so that it is built @@ -374,7 +374,7 @@ def test_valid_backend_names(): def test_exception_queue_works(tmp_path: pathlib.Path, dummy_state: State): """Test that exceptions get put on the exception queue and get thrown""" - with patch('composer.loggers.remote_uploader_downloader.S3ObjectStore', DummyObjectStore): + with patch('composer.utils.object_store.utils.S3ObjectStore', DummyObjectStore): # Create the object store logger remote_uploader_downloader = RemoteUploaderDownloader( bucket_uri=f"s3://{tmp_path}/'object_store_backend", diff --git a/tests/trainer/test_checkpoint.py 
b/tests/trainer/test_checkpoint.py index 9912563eb8..ede864d13b 100644 --- a/tests/trainer/test_checkpoint.py +++ b/tests/trainer/test_checkpoint.py @@ -4,6 +4,7 @@ import contextlib import copy import io +import multiprocessing import os import pathlib import re @@ -25,12 +26,11 @@ from composer.algorithms import NoOpModel from composer.callbacks import CheckpointSaver from composer.core import Callback, Time, TimeUnit -from composer.loggers import RemoteUploaderDownloader, remote_uploader_downloader from composer.metrics import MAP from composer.optim import ExponentialScheduler from composer.trainer import trainer from composer.trainer.trainer import Trainer -from composer.utils import dist, is_tar, reproducibility +from composer.utils import dist, is_tar, remote_uploader, reproducibility from composer.utils.checkpoint import ( _COMPOSER_STATES_FILENAME, PartialFilePath, @@ -52,6 +52,7 @@ device, ) from tests.common.markers import world_size +from tests.utils.test_remote_uploader import DummyObjectStore class DummyStatefulCallback(Callback): @@ -309,30 +310,6 @@ def get_trainer(self, **kwargs): model = SimpleConvModel() return Trainer(model=model, **kwargs) - @pytest.mark.parametrize('add_remote_ud', [True, False]) - def test_s3_uri_creates_remote_ud(self, add_remote_ud: bool, monkeypatch: MonkeyPatch): - mock_validate_credentials = MagicMock() - monkeypatch.setattr(remote_uploader_downloader, '_validate_credentials', mock_validate_credentials) - if add_remote_ud: - with pytest.warns(UserWarning): - trainer = self.get_trainer( - save_folder='s3://bucket_name/{run_name}/checkpoints', - loggers=[ - RemoteUploaderDownloader('s3://bucket_name', file_path_format_string='{remote_file_name}'), - ], - ) - else: - trainer = self.get_trainer(save_folder='s3://bucket_name/{run_name}/checkpoints') - - remote_uds = [ - logger_dest for logger_dest in trainer.logger.destinations - if isinstance(logger_dest, RemoteUploaderDownloader) - ] - assert len(remote_uds) == 1 - remote_ud = remote_uds[0] - assert remote_ud.remote_backend_name == 's3' - assert remote_ud.remote_bucket_name == 'bucket_name' - @pytest.mark.parametrize('uri', ['wandb://foo/bar', 'gcs://foo/bar', 'sftp://foo/bar"']) def test_other_uris_error_out(self, uri: str): with pytest.raises(NotImplementedError): @@ -394,7 +371,7 @@ def test_checkpoint_saver_properly_constructed( monkeypatch: MonkeyPatch, ): mock_validate_credentials = MagicMock() - monkeypatch.setattr(remote_uploader_downloader, '_validate_credentials', mock_validate_credentials) + monkeypatch.setattr(remote_uploader, 'validate_credentials', mock_validate_credentials) trainer = self.get_trainer(save_folder=save_folder) @@ -646,6 +623,71 @@ def test_checkpoint_multiple_callbacks( assert id(trainer._checkpoint_saver) == id(checkpoint_savers[0]) assert len([cb for cb in trainer.state.callbacks if isinstance(cb, CheckpointSaver)]) == len(checkpoint_savers) + @pytest.mark.parametrize(('upload_success'), [True, False]) + def test_checkpoint_remote_symlink( + self, + upload_success: bool, + ): + import multiprocessing + fork_context = multiprocessing.get_context('fork') + tmp_dir = tempfile.TemporaryDirectory() + + def _get_tmp_dir(self): + return tmp_dir + + class _AlwaysFailDummyObjectStore(DummyObjectStore): + + def upload_object(self, object_name, filename, callback=None): + # Only allows to upload symlink to simulate + # the situation that checkpoint file uploading fails + if 'symlink' in object_name or 'credentials_validated_successfully' in object_name: + return 
super().upload_object(object_name, filename, callback) + raise RuntimeError('Raise Error intentionally') + + if upload_success: + MockObjectStore = DummyObjectStore + else: + MockObjectStore = _AlwaysFailDummyObjectStore + + with patch('composer.utils.object_store.utils.S3ObjectStore', MockObjectStore): + with patch('tests.utils.test_remote_uploader.DummyObjectStore.get_tmp_dir', _get_tmp_dir): + with patch('composer.utils.remote_uploader.multiprocessing.get_context', lambda _: fork_context): + train_dataset = RandomClassificationDataset(size=10) + train_dataloader = DataLoader( + dataset=train_dataset, + batch_size=2, + sampler=dist.get_sampler(train_dataset), + ) + + trainer = Trainer( + model=SimpleModel(), + train_dataloader=train_dataloader, + save_interval='1ba', + max_duration='1ba', + save_folder='S3://whatever/', + ) + symlink_filepath = os.path.join(tmp_dir.name, 'latest-rank0.pt.symlink') + if upload_success: + trainer.fit() + with open(symlink_filepath, 'r') as f: + assert f.read() == 'ep0-ba1-rank0.pt' + else: + assert trainer._checkpoint_saver is not None + trainer._checkpoint_saver._symlink_upload_wait_before_next_try_in_seconds = 0.01 + trainer._checkpoint_saver.upload_timeout_in_seconds = 1 + with pytest.raises(RuntimeError, match='Raise Error intentionally'): + trainer.fit() + assert os.path.exists(symlink_filepath) == False + + def post_close(self): + return + + assert trainer._checkpoint_saver is not None + trainer._checkpoint_saver.post_close = post_close.__get__( + trainer._checkpoint_saver, + CheckpointSaver, + ) + class TestCheckpointLoading: @@ -709,25 +751,6 @@ def get_trainer( **kwargs, ) - def get_logger(self, tmp_path: pathlib.Path): - """Returns an object store logger that saves locally.""" - remote_dir = str(tmp_path / 'object_store') - os.makedirs(remote_dir, exist_ok=True) - - return RemoteUploaderDownloader( - bucket_uri='libcloud://.', - backend_kwargs={ - 'provider': 'local', - 'container': '.', - 'provider_kwargs': { - 'key': remote_dir, - }, - }, - num_concurrent_uploads=1, - use_procs=False, - upload_staging_folder=str(tmp_path / 'staging_folder'), - ) - @world_size(1, 2) @device('cpu', 'gpu') @pytest.mark.parametrize('use_object_store', [True, False]) @@ -758,9 +781,6 @@ def test_autoresume( if delete_local and not use_object_store: pytest.skip('Invalid test setting.') - if use_object_store: - pytest.importorskip('libcloud') - latest_filename = 'latest-rank{rank}' + file_extension if test_slashed: latest_filename = 'testdir/' + latest_filename @@ -768,51 +788,68 @@ def test_autoresume( if is_compressed_pt(latest_filename) and not get_compressor(latest_filename).exists: pytest.skip(reason=f'compressor not found for {latest_filename}') - trainer_1 = self.get_trainer( - latest_filename=latest_filename, - file_extension=file_extension, - save_folder='first', - device=device, - run_name='big-chungus', - autoresume=True, - loggers=[self.get_logger(tmp_path)] if use_object_store else [], - save_metrics=save_metrics, - ) - - # trains the model, saving the checkpoint files - trainer_1.fit() - trainer_1.close() - - if delete_local: - # delete files locally, forcing trainer to look in object store - shutil.rmtree('first') - - trainer_2 = self.get_trainer( - latest_filename=latest_filename, - save_folder='first', - device=device, - run_name='big-chungus', - autoresume=True, - load_path='ignore_me.pt', # this should be ignored - load_ignore_keys=['*'], # this should be ignored - save_overwrite=save_overwrite, - loggers=[self.get_logger(tmp_path)] if 
use_object_store else [], - ) - - self._assert_weights_equivalent( - trainer_1.state.model, - trainer_2.state.model, - ) - - if save_metrics: - assert self._metrics_equal( - trainer_1.state.train_metrics, - trainer_2.state.train_metrics, - trainer_1.state.eval_metrics, - trainer_2.state.eval_metrics, - ), 'Original metrics do not equal metrics from loaded checkpoint.' - - assert trainer_1.state.run_name == trainer_2.state.run_name + if use_object_store: + save_folder = 's3://bucket_name/first' + else: + save_folder = 'first' + + # Mock S3 object store + fork_context = multiprocessing.get_context('fork') + tmp_dir = tempfile.TemporaryDirectory() + + def _get_tmp_dir(self): + return tmp_dir + + with patch('composer.utils.object_store.utils.S3ObjectStore', DummyObjectStore): + with patch('tests.utils.test_remote_uploader.DummyObjectStore.get_tmp_dir', _get_tmp_dir): + with patch('composer.utils.remote_uploader.multiprocessing.get_context', lambda _: fork_context): + + trainer_1 = self.get_trainer( + latest_filename=latest_filename, + file_extension=file_extension, + save_folder=save_folder, + device=device, + run_name='big-chungus', + autoresume=True, + save_metrics=save_metrics, + ) + if use_object_store: + assert trainer_1._checkpoint_saver is not None + trainer_1._checkpoint_saver._symlink_upload_wait_before_next_try_in_seconds = 0.01 + + # trains the model, saving the checkpoint files + trainer_1.fit() + trainer_1.close() + + if delete_local: + # delete files locally, forcing trainer to look in object store + shutil.rmtree('first') + + trainer_2 = self.get_trainer( + latest_filename=latest_filename, + save_folder=save_folder, + device=device, + run_name='big-chungus', + autoresume=True, + load_path='ignore_me.pt', # this should be ignored + load_ignore_keys=['*'], # this should be ignored + save_overwrite=save_overwrite, + ) + + self._assert_weights_equivalent( + trainer_1.state.model, + trainer_2.state.model, + ) + + if save_metrics: + assert self._metrics_equal( + trainer_1.state.train_metrics, + trainer_2.state.train_metrics, + trainer_1.state.eval_metrics, + trainer_2.state.eval_metrics, + ), 'Original metrics do not equal metrics from loaded checkpoint.' 
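The object-store mocking recipe above repeats throughout these tests: patch the `S3ObjectStore` symbol that `build_remote_backend` resolves with the local-directory `DummyObjectStore`, pin every store instance to one shared temporary directory, and force fork-based worker processes so the patches survive inside the uploader subprocess. A minimal sketch of that recipe, using only names defined in these tests (the trainer construction inside the `with` block is elided):

import multiprocessing
import tempfile
from unittest.mock import patch

from tests.utils.test_remote_uploader import DummyObjectStore

fork_context = multiprocessing.get_context('fork')
shared_tmp_dir = tempfile.TemporaryDirectory()

def _get_tmp_dir(self):
    # Every DummyObjectStore instance shares one backing directory, so files
    # "uploaded" by the worker process are visible to the test process.
    return shared_tmp_dir

with patch('composer.utils.object_store.utils.S3ObjectStore', DummyObjectStore), \
     patch('tests.utils.test_remote_uploader.DummyObjectStore.get_tmp_dir', _get_tmp_dir), \
     patch('composer.utils.remote_uploader.multiprocessing.get_context', lambda _: fork_context):
    ...  # build the Trainer with an s3:// save_folder and run fit()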
+ + assert trainer_1.state.run_name == trainer_2.state.run_name @pytest.mark.parametrize(('save_folder'), [None, 'first']) def test_autoresume_from_callback( @@ -862,7 +899,7 @@ def test_autoresume_from_callback( def test_load_from_uri(self, load_path: str, load_object_store: Optional[ObjectStore], monkeypatch: MonkeyPatch): mock_validate_credentials = MagicMock() - monkeypatch.setattr(remote_uploader_downloader, '_validate_credentials', mock_validate_credentials) + monkeypatch.setattr(remote_uploader, 'validate_credentials', mock_validate_credentials) mock_load_checkpoint = MagicMock() monkeypatch.setattr(trainer.checkpoint, 'load_checkpoint', mock_load_checkpoint) self.get_trainer(load_path=load_path, load_object_store=load_object_store) @@ -882,7 +919,7 @@ def test_load_from_uri(self, load_path: str, load_object_store: Optional[ObjectS ) def test_other_backends_error(self, load_path: str, monkeypatch: MonkeyPatch): mock_validate_credentials = MagicMock() - monkeypatch.setattr(remote_uploader_downloader, '_validate_credentials', mock_validate_credentials) + monkeypatch.setattr(remote_uploader, 'validate_credentials', mock_validate_credentials) with pytest.raises(NotImplementedError): self.get_trainer(load_path=load_path) @@ -1197,29 +1234,37 @@ def _stateful_callbacks_equal(self, callbacks1, callbacks2): return cb1.random_value == cb2.random_value def test_load_weights_object_store(self, tmp_path): - - pytest.importorskip('libcloud') - - trainer_1 = self.get_trainer( - save_folder='{run_name}/checkpoints', - loggers=[self.get_logger(tmp_path)], - run_name='electric-zebra', - ) - trainer_1.fit() - trainer_1.close() - - trainer_2 = self.get_trainer( - loggers=[self.get_logger(tmp_path)], - run_name='electric-zebra', - load_path='electric-zebra/checkpoints/latest-rank0.pt', - load_object_store=self.get_logger(tmp_path), - ) - - # check weights loaded properly - self._assert_weights_equivalent( - trainer_1.state.model, - trainer_2.state.model, - ) + # Mock S3 object store + fork_context = multiprocessing.get_context('fork') + tmp_dir = tempfile.TemporaryDirectory() + + def _get_tmp_dir(self): + return tmp_dir + + with patch('composer.utils.object_store.utils.S3ObjectStore', DummyObjectStore): + with patch('tests.utils.test_remote_uploader.DummyObjectStore.get_tmp_dir', _get_tmp_dir): + with patch('composer.utils.remote_uploader.multiprocessing.get_context', lambda _: fork_context): + save_folder = 's3://my_bucket/{run_name}/checkpoints' + trainer_1 = self.get_trainer( + save_folder=save_folder, + run_name='electric-zebra', + ) + assert trainer_1._checkpoint_saver is not None + trainer_1._checkpoint_saver._symlink_upload_wait_before_next_try_in_seconds = 0.01 + trainer_1.fit() + trainer_1.close() + + trainer_2 = self.get_trainer( + run_name='electric-zebra', + load_path='electric-zebra/checkpoints/latest-rank0.pt', + load_object_store=DummyObjectStore(), + ) + + # check weights loaded properly + self._assert_weights_equivalent( + trainer_1.state.model, + trainer_2.state.model, + ) @pytest.mark.parametrize( 'run_name,save_folder,latest_filename', diff --git a/tests/utils/test_remote_uploader.py b/tests/utils/test_remote_uploader.py index 847abb369d..2e41e91d18 100644 --- a/tests/utils/test_remote_uploader.py +++ b/tests/utils/test_remote_uploader.py @@ -20,7 +20,7 @@ class DummyObjectStore(ObjectStore): """Dummy ObjectStore implementation that is backed by a local directory.""" def __init__(self, **kwargs: Dict[str, Any]) -> None: - self.tmp_dir = tempfile.TemporaryDirectory() + self.tmp_dir = 
self.get_tmp_dir() self.root = self.tmp_dir.name self.sleep_sec = 0 self.dest_filename = '' @@ -28,6 +28,9 @@ def __init__(self, **kwargs: Dict[str, Any]) -> None: def raise_error(self): return False + def get_tmp_dir(self): + return tempfile.TemporaryDirectory() + def upload_object( self, object_name: str, @@ -38,6 +41,7 @@ def upload_object( raise RuntimeError('Raise Error intentionally') time.sleep(self.sleep_sec) dest_filename = pathlib.Path(self.root) / object_name + os.makedirs(os.path.dirname(dest_filename), exist_ok=True) shutil.copy2(filename, dest_filename) self.dest_filename = dest_filename @@ -46,6 +50,16 @@ def get_object_size(self, object_name: str) -> int: size = os.stat(object_path).st_size return size + def download_object( + self, + object_name: str, + filename: Union[str, pathlib.Path], + overwrite: bool = False, + callback: Optional[Callable[[int, int], None]] = None, + ): + object_path = pathlib.Path(self.root) / object_name + shutil.copy2(object_path, filename) + def test_upload_mutliple_files(): fork_context = multiprocessing.get_context('fork') @@ -54,7 +68,7 @@ def test_upload_mutliple_files(): def _get_tmp_dir(): return tmp_dir - with patch('composer.utils.file_helpers.S3ObjectStore', DummyObjectStore): + with patch('composer.utils.object_store.utils.S3ObjectStore', DummyObjectStore): with patch('tempfile.TemporaryDirectory', _get_tmp_dir): with patch('composer.utils.remote_uploader.multiprocessing.get_context', lambda _: fork_context): remote_uploader = RemoteUploader( @@ -99,7 +113,7 @@ def _get_tmp_dir(): return remote_tmp_dir fork_context = multiprocessing.get_context('fork') - with patch('composer.utils.file_helpers.S3ObjectStore', DummyObjectStore): + with patch('composer.utils.object_store.utils.S3ObjectStore', DummyObjectStore): with patch('tempfile.TemporaryDirectory', _get_tmp_dir): with patch('composer.utils.remote_uploader.multiprocessing.get_context', lambda _: fork_context): remote_uploader = RemoteUploader(remote_folder='S3://whatever/path',) @@ -145,7 +159,7 @@ def raise_error(self): return True fork_context = multiprocessing.get_context('fork') - with patch('composer.utils.file_helpers.S3ObjectStore', AlwaysFailDummyObjectStore): + with patch('composer.utils.object_store.utils.S3ObjectStore', AlwaysFailDummyObjectStore): with patch('composer.utils.remote_uploader.multiprocessing.get_context', lambda _: fork_context): remote_uploader = RemoteUploader(remote_folder='S3://whatever/path',) tmp_dir = tempfile.TemporaryDirectory() @@ -168,7 +182,7 @@ def raise_error(self): def test_wait(): fork_context = multiprocessing.get_context('fork') - with patch('composer.utils.file_helpers.S3ObjectStore', DummyObjectStore): + with patch('composer.utils.object_store.utils.S3ObjectStore', DummyObjectStore): with patch('composer.utils.remote_uploader.multiprocessing.get_context', lambda _: fork_context): remote_uploader = RemoteUploader( remote_folder='S3://whatever/path', @@ -197,7 +211,7 @@ def test_wait(): def test_wait_and_close(): fork_context = multiprocessing.get_context('fork') - with patch('composer.utils.file_helpers.S3ObjectStore', DummyObjectStore): + with patch('composer.utils.object_store.utils.S3ObjectStore', DummyObjectStore): with patch('composer.utils.remote_uploader.multiprocessing.get_context', lambda _: fork_context): remote_uploader = RemoteUploader( remote_folder='S3://whatever/path', From e951f0a81ed65ea3f607cf437b6387de4c7fc632 Mon Sep 17 00:00:00 2001 From: Saaketh Narayan Date: Mon, 8 Jul 2024 16:19:32 -0700 Subject: [PATCH 55/69] 
Correctly process `parallelism_config['tp']` when it's a dict (#3434) * big fix * testing * ignore * ignore * ignore * Update test_fsdp_checkpoint.py * Update test_fsdp_checkpoint.py --------- Co-authored-by: Mihir Patel --- composer/trainer/trainer.py | 2 +- tests/trainer/test_fsdp_checkpoint.py | 17 +++++++++++++++-- tests/trainer/test_tp.py | 1 + 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index c752187ba6..b410e8aa96 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -1231,7 +1231,7 @@ def __init__( if isinstance(parallelism_config['tp'], TPConfig): parallelism_config_args['tp'] = parallelism_config['tp'] else: - parallelism_config['tp'] = TPConfig(**parallelism_config['tp']) + parallelism_config_args['tp'] = TPConfig(**parallelism_config['tp']) parallelism_config = ParallelismConfig( **parallelism_config_args, ) if len(parallelism_config_args) > 0 else None diff --git a/tests/trainer/test_fsdp_checkpoint.py b/tests/trainer/test_fsdp_checkpoint.py index 154ed6b282..a59e60172a 100644 --- a/tests/trainer/test_fsdp_checkpoint.py +++ b/tests/trainer/test_fsdp_checkpoint.py @@ -30,7 +30,7 @@ from composer.models import ComposerClassifier from composer.optim import DecoupledAdamW from composer.trainer import Trainer -from composer.utils import FSDPConfig, dist, parse_uri +from composer.utils import FSDPConfig, TPConfig, dist, parse_uri from composer.utils.checkpoint import is_checkpoint_legacy_sharded from composer.utils.file_helpers import get_file from composer.utils.object_store import S3ObjectStore @@ -288,6 +288,7 @@ def _compare_timestamps_between_state_dicts(state_dict1, state_dict2): @pytest.mark.gpu @pytest.mark.filterwarnings(r'ignore:.*scatter_full_optim_state_dict``is being deprecated.*:UserWarning') +@pytest.mark.filterwarnings(r'ignore:.*\(TP\) is experimental.*:FutureWarning') @pytest.mark.parametrize( 'optimizer,autoresume,precision,save_weights_only,load_weights_only,load_monolith_rank0_only,use_tp,use_hsdp', [ @@ -315,7 +316,9 @@ def test_fsdp_full_state_dict_load( use_hsdp: bool, ): if use_hsdp: - pytest.xfail('Known Pytorch issue with HSDP, waiting for pytorch patch') + pytest.xfail('Known PyTorch issue with HSDP, waiting for pytorch patch') + if use_tp: + pytest.skip('TP on PyTorch 2.3 has full state dict issues.') if (use_tp or use_hsdp) and version.parse(torch.__version__) < version.parse('2.3.0'): pytest.skip('HSDP and TP require torch 2.3.0 or later') if autoresume: @@ -360,6 +363,11 @@ def test_fsdp_full_state_dict_load( fsdp_config=fsdp_config, tp_config=tp_config, ) + + if use_tp: + assert trainer1.state.tp_config is not None + assert isinstance(trainer1.state.tp_config, TPConfig) + trainer1.fit() state_dict_from_trainer1 = trainer1.state.state_dict() trainer1.close() @@ -511,6 +519,7 @@ def test_fsdp_mixed_with_sync( @pytest.mark.filterwarnings(r'ignore:.*metrics are not saved with sharded state dict.*:UserWarning') @pytest.mark.filterwarnings(r'ignore:.*The CUDA RNG state could not be loaded.*:UserWarning') @pytest.mark.filterwarnings(r'ignore:.*ShardedTensor.to only move tensor to its current device.*:UserWarning') +@pytest.mark.filterwarnings(r'ignore:.*\(TP\) is experimental.*:FutureWarning') def test_fsdp_load_old_checkpoint( world_size, tmp_path: pathlib.Path, @@ -748,6 +757,7 @@ def test_fsdp_full_state_dict_load_with_ema( @pytest.mark.filterwarnings(r'ignore:TypedStorage is deprecated.:UserWarning') @pytest.mark.filterwarnings(r'ignore:.*metrics are not 
saved with sharded state dict.*:UserWarning') @pytest.mark.filterwarnings(r'ignore:Please use DTensor instead and we are deprecating ShardedTensor.:UserWarning') +@pytest.mark.filterwarnings(r'ignore:.*\(TP\) is experimental.*:FutureWarning') def test_checkpoint_loading_with_validation(world_size, tmp_path, is_valid_checkpoint: bool, state_dict_type: str): # Set the error expectations. expectation = does_not_raise() @@ -818,6 +828,7 @@ def mock_get_checkpoint_validation_function(): @pytest.mark.filterwarnings(r'ignore:TypedStorage is deprecated.:UserWarning') @pytest.mark.filterwarnings(r'ignore:.*metrics are not saved with sharded state dict.*:UserWarning') @pytest.mark.filterwarnings(r'ignore:Please use DTensor instead and we are deprecating ShardedTensor.:UserWarning') +@pytest.mark.filterwarnings(r'ignore:.*\(TP\) is experimental.*:FutureWarning') def test_fsdp_partitioned_state_dict_load( tmp_path: pathlib.Path, autoresume: bool, @@ -833,6 +844,8 @@ def test_fsdp_partitioned_state_dict_load( s3_ephemeral_prefix, request, ): + if use_tp: + pytest.skip('TP on PyTorch 2.3 has sharded state dict issues.') if weights_only and autoresume: pytest.skip('Weights only with autoresume is not supported') if (use_tp or use_hsdp) and version.parse(torch.__version__) < version.parse('2.3.0'): diff --git a/tests/trainer/test_tp.py b/tests/trainer/test_tp.py index 8146ebad40..bfee2e13c9 100644 --- a/tests/trainer/test_tp.py +++ b/tests/trainer/test_tp.py @@ -18,6 +18,7 @@ @pytest.mark.gpu @world_size(4) @pytest.mark.skipif(version.parse(torch.__version__) < version.parse('2.3'), reason='requires PyTorch 2.3+') +@pytest.mark.filterwarnings(r'ignore:.*\(TP\) is experimental.*:FutureWarning') def test_tp_train(world_size: int): from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel From 6dec8359374dc81b1569e21744c0942ff91f0e72 Mon Sep 17 00:00:00 2001 From: bigning Date: Mon, 8 Jul 2024 22:11:22 -0700 Subject: [PATCH 56/69] [checkpoint v2] Download api (#3447) * a * a * fix lint and test * lint * comments * comment --- composer/checkpoint/__init__.py | 2 + composer/checkpoint/download.py | 85 +++++++++++++++++++++++++++++ composer/utils/__init__.py | 2 + composer/utils/file_helpers.py | 11 ++++ tests/checkpoint/test_download.py | 56 +++++++++++++++++++ tests/utils/test_remote_uploader.py | 2 + 6 files changed, 158 insertions(+) create mode 100644 composer/checkpoint/download.py create mode 100644 tests/checkpoint/test_download.py diff --git a/composer/checkpoint/__init__.py b/composer/checkpoint/__init__.py index d4b21c790d..33162fc5e6 100644 --- a/composer/checkpoint/__init__.py +++ b/composer/checkpoint/__init__.py @@ -3,6 +3,7 @@ """Module for checkpointing API.""" +from composer.checkpoint.download import download_monolithic_checkpoint from composer.checkpoint.state_dict import ( get_metadata_state_dict, get_model_state_dict, @@ -15,4 +16,5 @@ 'get_optim_state_dict', 'get_metadata_state_dict', 'get_resumption_state_dict', + 'download_monolithic_checkpoint', ] diff --git a/composer/checkpoint/download.py b/composer/checkpoint/download.py new file mode 100644 index 0000000000..01a80beb5f --- /dev/null +++ b/composer/checkpoint/download.py @@ -0,0 +1,85 @@ +# Copyright 2024 MosaicML Composer authors +# SPDX-License-Identifier: Apache-2.0 + +"""Useful functions for load checkpoints from remote object store or local disk.""" + +import logging +from typing import Optional + +from composer.utils import ( + dist, + extract_path_from_symlink, + maybe_create_object_store_from_uri, + 
parse_uri, + retry, +) + +log = logging.getLogger(__name__) + + +def download_file( + source_uri: str, + destination_path: str, + node_ranks: Optional[list[int]] = None, + num_attempts: int = 5, +): + """Downloads a file (object) from the specified URI to the specified directory. + + Args: + source_uri (str): The URI to download the file from or a symlink to the URI. + destination_path (str): The directory to download the file to. + node_ranks (list[int]): The ranks of the nodes that will download the file. If None, all nodes will download the file. + num_attempts (int): Retry for object store downloads. Default to 5. + """ + # Only local rank 0 downloads + local_rank = dist.get_local_rank() + if local_rank != 0: + return + + node_rank = dist.get_node_rank() + if node_ranks is not None and node_rank not in node_ranks: + return + + object_store = maybe_create_object_store_from_uri(source_uri) + _, _, source_path = parse_uri(source_uri) + if source_uri.endswith('.symlink'): + source_path = extract_path_from_symlink(source_path, object_store) + assert object_store is not None + + @retry(num_attempts=num_attempts) + def _download(): + object_store.download_object( + object_name=source_path, + filename=destination_path, + ) + + log.debug(f'Downloading {source_path} to {destination_path}') + _download() + log.debug(f'Finished downloading {source_path} to {destination_path}') + + +def download_monolithic_checkpoint( + source_uri: str, + destination_path: str, + global_rank_zero_only: bool = True, +): + """Downloads a monolithic checkpoint from the specified URI to the specified directory. + + Args: + source_uri (str): The URI to download the checkpoint from or symlink that points to the URI. + destination_path (str): The directory to download the checkpoint to. + global_rank_zero_only (bool): If True, only rank 0 will download the checkpoint. + broadcast_files_to_other_nodes (bool): If True, the downloaded checkpoint will be broadcast to all other nodes. + If torch syncs modules states this is unnecessary. + """ + node_ranks = None + if global_rank_zero_only: + node_ranks = [0] + download_file( + source_uri=source_uri, + destination_path=destination_path, + node_ranks=node_ranks, + ) + if not global_rank_zero_only or (global_rank_zero_only and dist.get_global_rank() == 0): + return destination_path + return None diff --git a/composer/utils/__init__.py b/composer/utils/__init__.py index 0850fd2bdd..20fa44e092 100644 --- a/composer/utils/__init__.py +++ b/composer/utils/__init__.py @@ -37,6 +37,7 @@ create_symlink_file, ensure_folder_has_no_conflicting_files, ensure_folder_is_empty, + extract_path_from_symlink, format_name_with_dist, format_name_with_dist_and_time, get_file, @@ -158,6 +159,7 @@ 'ParallelismConfig', 'MLFLOW_EXPERIMENT_ID_FORMAT_KEY', 'MLFLOW_RUN_ID_FORMAT_KEY', + 'extract_path_from_symlink', 'RemoteUploader', 'validate_credentials', 'build_remote_backend', diff --git a/composer/utils/file_helpers.py b/composer/utils/file_helpers.py index 11d10328ea..4f458b0a8e 100644 --- a/composer/utils/file_helpers.py +++ b/composer/utils/file_helpers.py @@ -49,6 +49,7 @@ 'maybe_create_object_store_from_uri', 'maybe_create_remote_uploader_downloader_from_uri', 'parse_uri', + 'extract_path_from_symlink', 'validate_credentials', ] @@ -57,6 +58,16 @@ def extract_path_from_symlink( source_path: str, object_store: Optional[Union[LoggerDestination, ObjectStore]] = None, ) -> str: + """Returns the checkpont path from symlink file. + + Args: + source_path(str): The remote symlink path. 
+ object_store(LoggerDestination | ObjectStore, optional): The object store + used to download the remote symlink file + + Returns: + str: The content of the remote symlink file. + """ if object_store is not None: with tempfile.TemporaryDirectory() as tmpdir: _, _, source_path = parse_uri(source_path) diff --git a/tests/checkpoint/test_download.py b/tests/checkpoint/test_download.py new file mode 100644 index 0000000000..98c937bac4 --- /dev/null +++ b/tests/checkpoint/test_download.py @@ -0,0 +1,56 @@ +# Copyright 2024 MosaicML Composer authors +# SPDX-License-Identifier: Apache-2.0 + +import os +import tempfile +from unittest.mock import patch + +import pytest +import torch + +from composer.checkpoint import download_monolithic_checkpoint +from composer.utils import dist +from tests.checkpoint.helpers import init_model +from tests.common.markers import world_size +from tests.utils.test_remote_uploader import DummyObjectStore + + +@world_size(1, 2) +@pytest.mark.gpu +@pytest.mark.parametrize('rank_zero_only', [True, False]) +def test_download_monolithic_checkpoint(world_size: int, rank_zero_only: bool): + # Write a checkpoint + tmp_dir = tempfile.TemporaryDirectory() + use_fsdp = False + if world_size > 1: + use_fsdp = True + fsdp_model, _ = init_model(use_fsdp=use_fsdp) + + from torch.distributed.checkpoint.state_dict import StateDictOptions, get_model_state_dict + state = get_model_state_dict(fsdp_model, options=StateDictOptions(full_state_dict=True)) + + checkpoint_filename = 'state_dict' + save_filename = os.path.join(tmp_dir.name, checkpoint_filename) + if dist.get_global_rank() == 0: + torch.save(state, save_filename) + + class DummyS3ObjectStore(DummyObjectStore): + + def get_tmp_dir(self): + return tmp_dir + + # Download a monolithic checkpoint + local_file_name = 'state_dict.download' + with patch('composer.utils.file_helpers.S3ObjectStore', DummyS3ObjectStore): + ret = download_monolithic_checkpoint( + source_uri=f's3://bucket_name/{checkpoint_filename}', + destination_path=local_file_name, + global_rank_zero_only=rank_zero_only, + ) + dist.barrier() + + if rank_zero_only and dist.get_global_rank() != 0: + assert ret == None + if dist.get_global_rank() == 0: + assert ret == local_file_name + assert os.path.isfile(local_file_name) == True diff --git a/tests/utils/test_remote_uploader.py b/tests/utils/test_remote_uploader.py index 2e41e91d18..100e64ecf0 100644 --- a/tests/utils/test_remote_uploader.py +++ b/tests/utils/test_remote_uploader.py @@ -57,6 +57,8 @@ def download_object( overwrite: bool = False, callback: Optional[Callable[[int, int], None]] = None, ): + if overwrite is False and os.path.isfile(filename): + raise FileExistsError(f'The file at {filename} already exists and overwrite is set to False.') object_path = pathlib.Path(self.root) / object_name shutil.copy2(object_path, filename) From 18795f14ebc19f668ddabce2059382b6b516ce93 Mon Sep 17 00:00:00 2001 From: Jane Zhang Date: Tue, 9 Jul 2024 12:45:04 -0700 Subject: [PATCH 57/69] removed exception from logger (#3464) --- composer/loggers/mosaicml_logger.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/composer/loggers/mosaicml_logger.py b/composer/loggers/mosaicml_logger.py index 8b4ff5942a..d7c83b85fa 100644 --- a/composer/loggers/mosaicml_logger.py +++ b/composer/loggers/mosaicml_logger.py @@ -97,10 +97,6 @@ def log_hyperparameters(self, hyperparameters: dict[str, Any]): def log_metrics(self, metrics: dict[str, Any], step: Optional[int] = None) -> None: self.log_metadata(metrics) - def log_exception(self, 
exception: Exception): - self.log_metadata({'exception': exception_to_json_serializable_dict(exception)}) - self._flush_metadata(force_flush=True) - def after_load(self, state: State, logger: Logger) -> None: # Log model data downloaded and initialized for run events log.debug(f'Logging model initialized time to metadata') From 11bad573d28b4f8a362257fc81598577553bbed5 Mon Sep 17 00:00:00 2001 From: Jack Zhang <170473087+JackZ-db@users.noreply.github.com> Date: Thu, 11 Jul 2024 12:19:05 -0700 Subject: [PATCH 58/69] fixed docs for mfu (#3469) --- composer/callbacks/speed_monitor.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/composer/callbacks/speed_monitor.py b/composer/callbacks/speed_monitor.py index 2098036297..6320e06562 100644 --- a/composer/callbacks/speed_monitor.py +++ b/composer/callbacks/speed_monitor.py @@ -223,10 +223,11 @@ class SpeedMonitor(Callback): | `throughput/device/flops_per_sec` | logged when model has attribute `flops_per_batch` | | | | +-------------------------------------+-----------------------------------------------------------+ - | | `throughput/device/flops_per_sec` divided by world size. | - | `throughput/device/mfu` | Only logged when model has attribute `flops_per_batch` | - | | and `gpu_flops_available`, which can be passed as an | - | | argument if not automatically determined by SpeedMonitor | + | | `throughput/device/flops_per_sec` divided by flops | + | | available on the GPU device. Only logged when model has | + | `throughput/device/mfu` | attribute `flops_per_batch` and `gpu_flops_available`, | + | | which can be passed as an argument if not automatically | + | | determined by SpeedMonitor | +-------------------------------------+-----------------------------------------------------------+ | `time/train` | Total elapsed training time | +-------------------------------------+-----------------------------------------------------------+ From 74c7d3bed8dbab110296dc5827dffb8de023a576 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Thu, 11 Jul 2024 12:51:37 -0700 Subject: [PATCH 59/69] add comment (#3470) --- composer/trainer/trainer.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index b410e8aa96..501bc4bf09 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -1652,6 +1652,7 @@ def __init__( # TP wrap if self.state.tp_config is not None: + # Init with globally fixed seed so all HSDP replicas have the same initial weights with reproducibility.seed_context(self.state.rank_zero_seed): prepare_tp_module( model, @@ -1660,6 +1661,7 @@ def __init__( # FSDP wrap if not using monolith checkpoint on rank 0 only if self.state.fsdp_config is not None and self.state.fsdp_config.auto_wrap and not self.state.load_monolith_rank0_only: + # Init with globally fixed seed so all HSDP replicas have the same initial weights with reproducibility.seed_context(self.state.rank_zero_seed): prepare_fsdp_module( model, @@ -1829,6 +1831,7 @@ def __init__( not self.state.fsdp_enabled and self.state.fsdp_config is not None and self.state.fsdp_config.auto_wrap and self.state.load_monolith_rank0_only ): + # Init with globally fixed seed so all HSDP replicas have the same initial weights with reproducibility.seed_context(self.state.rank_zero_seed): prepare_fsdp_module(model, optimizers, self.state.fsdp_config, precision, device, auto_microbatching) From 14bc187d82c9509c52bbf65d2614e6141e4701d1 Mon Sep 17 00:00:00 2001 From: Charles Tang Date: Thu, 11 Jul 2024 
13:28:47 -0700 Subject: [PATCH 60/69] Change pytorch eval for FP8 to default to fall back to BF16 (#3454) --- composer/core/precision.py | 4 +++- composer/trainer/trainer.py | 26 +++++++++++++++++++++----- tests/fixtures/fixtures.py | 3 ++- tests/trainer/test_trainer_eval.py | 30 ++++++++++++++++++++++++++++++ 4 files changed, 56 insertions(+), 7 deletions(-) diff --git a/composer/core/precision.py b/composer/core/precision.py index ea08a10c56..bb91fc64d1 100644 --- a/composer/core/precision.py +++ b/composer/core/precision.py @@ -40,6 +40,7 @@ class Precision(StringEnum): def get_precision_context( precision: Union[str, Precision], precision_config: Optional[dict[str, Any]] = None, + fp8_autocast_enabled: bool = True, ) -> Generator[None, None, None]: """Returns a context manager to automatically cast to a specific precision. @@ -47,6 +48,7 @@ def get_precision_context( precision (str | Precision): Precision for the context precision_config (Optional[dict[str, Any]]): Config for FP8 scaling strategy. See parameters for `DelayedScaling `_. + fp8_autocast_enabled (bool): Whether to enable FP8 autocast. Defaults to True. """ precision = Precision(precision) if precision == Precision.FP32: @@ -86,7 +88,7 @@ def get_precision_context( 'amax_compute_algo': 'max', } fp8_recipe = DelayedScaling(**precision_config) - with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe): + with te.fp8_autocast(enabled=fp8_autocast_enabled, fp8_recipe=fp8_recipe): # The te.onnx_export flag ensures that we save all fp8 buffers # as tensors instead of bytes. This is necessary for proper # saving and resumption of checkpoints. diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index 501bc4bf09..b62b3d3e58 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -460,10 +460,15 @@ def _get_ddp_sync_strategy(ddp_sync_strategy: Optional[Union[str, DDPSyncStrateg return ddp_sync_strategy -def _get_precision_context(precision: Precision, precision_config: Optional[dict[str, Any]], deepspeed_enabled: bool): +def _get_precision_context( + precision: Precision, + precision_config: Optional[dict[str, Any]], + deepspeed_enabled: bool, + fp8_autocast_enabled: bool = True, +): if deepspeed_enabled: return contextlib.nullcontext() - return get_precision_context(precision, precision_config) + return get_precision_context(precision, precision_config, fp8_autocast_enabled) def _generate_run_name() -> str: @@ -2675,10 +2680,15 @@ def _train_loop(self) -> None: def _eval_train_metrics(self, device_batch): assert self._train_data_spec is not None, 'The train data spec should be set on __init__ or fit()' assert self.state.train_metrics is not None, 'The train metrics should be set on __init__ or fit()' - + # We disable FP8 autocast in eval metrics and default to the activation dtype for the forward pass + # This is because FP8 in TE requires all eval data sizes to be divisible by 16 which does not hold for all evaluation datasets. + # See https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/examples/fp8_primer.html for more info. + # Note: the activation dtype is BF16 if FSDP Mixed Precision PURE is enabled and FP32 if FSDP Mixed Precision FULL is enabled. + # See https://github.com/NVIDIA/TransformerEngine/blob/8e039fdcd98fc56582d81e373880c1509c2b8f73/transformer_engine/pytorch/module/linear.py#L250-L252 and \ + # https://github.com/NVIDIA/TransformerEngine/blob/8e039fdcd98fc56582d81e373880c1509c2b8f73/transformer_engine/pytorch/module/base.py#L495-L513 for more info. 
with torch.no_grad(),\ model_eval_mode(self.state.model),\ - _get_precision_context(self.state.precision, self.state.precision_config, self.state.deepspeed_enabled): + _get_precision_context(self.state.precision, self.state.precision_config, self.state.deepspeed_enabled, fp8_autocast_enabled=False): eval_outputs = self._original_model.eval_forward(device_batch, self.state.outputs) for metric in self.state.train_metrics.values(): self._original_model.update_metric( @@ -3473,11 +3483,17 @@ def _eval_loop( )[0] self.engine.run_event(Event.EVAL_BEFORE_FORWARD) - + # We disable FP8 autocast in eval mode and default to the activation dtype for the forward pass + # This is because FP8 in TE requires all eval data sizes to be divisible by 16 which does not hold for all evaluation datasets. + # See https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/examples/fp8_primer.html for more info. + # Note: the activation dtype is BF16 if FSDP Mixed Precision PURE is enabled and FP32 if FSDP Mixed Precision FULL is enabled. + # See https://github.com/NVIDIA/TransformerEngine/blob/8e039fdcd98fc56582d81e373880c1509c2b8f73/transformer_engine/pytorch/module/linear.py#L250-L252 and \ + # https://github.com/NVIDIA/TransformerEngine/blob/8e039fdcd98fc56582d81e373880c1509c2b8f73/transformer_engine/pytorch/module/base.py#L495-L513 for more info. with _get_precision_context( self.state.precision, self.state.precision_config, self.state.deepspeed_enabled, + fp8_autocast_enabled=False, ): self.state.outputs = self._original_model.eval_forward(self.state.batch) diff --git a/tests/fixtures/fixtures.py b/tests/fixtures/fixtures.py index f587079073..c4dd3fa65f 100644 --- a/tests/fixtures/fixtures.py +++ b/tests/fixtures/fixtures.py @@ -14,7 +14,7 @@ from composer.core import State from composer.devices import DeviceCPU, DeviceGPU from composer.loggers import Logger -from composer.utils import dist +from composer.utils import dist, retry from tests.common import RandomClassificationDataset, SimpleModel from tests.conftest import _get_option @@ -310,6 +310,7 @@ def _session_tiny_t5_config(): # type: ignore return tiny_t5_config_helper() +@retry(num_attempts=3) def tiny_t5_tokenizer_helper(): transformers = pytest.importorskip('transformers') diff --git a/tests/trainer/test_trainer_eval.py b/tests/trainer/test_trainer_eval.py index b548efde81..9a2d8d6ab4 100644 --- a/tests/trainer/test_trainer_eval.py +++ b/tests/trainer/test_trainer_eval.py @@ -92,6 +92,36 @@ def test_eval_with_nondivisible_dataset(world_size: int, size: int, batch_size: assert count.item() == size +from unittest.mock import patch + + +@pytest.mark.gpu +def test_amp_fp8_eval_casts_to_bf16(): + # Check that we can import FP8 with TE. If not, skip this test. + try: + import transformer_engine # pyright: ignore + except ImportError: + pytest.skip('Precision amp_fp8 requires transformer-engine to be installed',) + + # Mocking the transformer_engine.pytorch.fp8_autocast and running model eval. + with patch('transformer_engine.pytorch.fp8_autocast') as mock_fp8_autocast: + # Construct the trainer + trainer = Trainer(model=SimpleModel(), device='gpu', precision='amp_fp8') + # Evaluate the model + dataset = RandomClassificationDataset() + trainer.eval(eval_dataloader=DataLoader( + dataset=dataset, + batch_size=10, + sampler=dist.get_sampler(dataset), + ),) + + # Check that te.fp8_autocast was called with enabled=False. + # This ensures that we disable the FP8 context on eval. 
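The behavior under test is the new `fp8_autocast_enabled` flag on `get_precision_context`: during evaluation the FP8 recipe stays configured, but autocast is turned off so the forward pass runs in the activation dtype (BF16 under FSDP mixed-precision PURE, FP32 under FULL). A hedged sketch of calling it directly; the import path is assumed from the module's location, and `model`/`batch` are placeholders for a ComposerModel and an eval batch:

from composer.core.precision import get_precision_context

# Eval-time forward pass: keep the FP8 recipe around, but disable FP8 autocast so
# TE's divisible-by-16 shape requirement does not apply to eval batches.
with get_precision_context('amp_fp8', precision_config=None, fp8_autocast_enabled=False):
    outputs = model.eval_forward(batch)  # placeholder model/batch, for illustration only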
+ actual_call = mock_fp8_autocast.call_args_list[0] + actual_call_args = actual_call._get_call_arguments()[1] + assert actual_call_args['enabled'] is False + + def test_eval_call_with_trainer_evaluators(): trainer_dataset = RandomClassificationDataset() trainer_evaluator = Evaluator( From a5dc1555da1a1e9c7c4b707d2a66e8c244d614c6 Mon Sep 17 00:00:00 2001 From: Brian <23239305+b-chu@users.noreply.github.com> Date: Mon, 15 Jul 2024 10:27:38 -0400 Subject: [PATCH 61/69] Fix checkpoint events (#3468) --- composer/callbacks/checkpoint_saver.py | 14 +++++++++++--- composer/loggers/remote_uploader_downloader.py | 8 +++----- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/composer/callbacks/checkpoint_saver.py b/composer/callbacks/checkpoint_saver.py index 29468e66c3..661b3046ba 100644 --- a/composer/callbacks/checkpoint_saver.py +++ b/composer/callbacks/checkpoint_saver.py @@ -11,11 +11,12 @@ import shutil import tempfile import textwrap +import time from pathlib import Path from typing import Any, Callable, Optional, Union from composer.core import Callback, Event, State, Time, Timestamp -from composer.loggers import Logger, MLFlowLogger +from composer.loggers import Logger, MLFlowLogger, MosaicMLLogger from composer.utils import ( FORMAT_NAME_WITH_DIST_AND_TIME_TABLE, FORMAT_NAME_WITH_DIST_TABLE, @@ -619,8 +620,13 @@ def _rotate_checkpoints(self, sharding_enabled: bool = False): if dist.get_global_rank() == 0: shutil.rmtree(prefix_dir) + def _log_checkpoint_upload(self, logger: Logger): + for destination in logger.destinations: + if isinstance(destination, MosaicMLLogger): + destination.log_metadata({'checkpoint_uploaded_time': time.time()}, force_flush=True) + def batch_end(self, state: State, logger: Logger) -> None: - del state, logger # unused + del state # unused if self.remote_uploader is None: return self.remote_uploader.check_workers() @@ -643,13 +649,14 @@ def batch_end(self, state: State, logger: Logger) -> None: file_path=local_symlink_file, overwrite=True, ) + self._log_checkpoint_upload(logger) break else: raise RuntimeError(f'Failed to check if checkpoint files upload finish: {result}') self.symlink_upload_tasks = undone_symlink_upload_tasks def fit_end(self, state: State, logger: Logger) -> None: - del state, logger # unused + del state # unused if self.remote_uploader is None: return log.info('Waiting for checkpoint uploading to finish') @@ -666,6 +673,7 @@ def fit_end(self, state: State, logger: Logger) -> None: overwrite=True, ) symlink_upload_future.result() + self._log_checkpoint_upload(logger) else: raise RuntimeError(f'Failed to check if checkpoint files upload finish: {result}') log.info('Checkpoint uploading finished!') diff --git a/composer/loggers/remote_uploader_downloader.py b/composer/loggers/remote_uploader_downloader.py index 9378d5a8d4..a143ac1421 100644 --- a/composer/loggers/remote_uploader_downloader.py +++ b/composer/loggers/remote_uploader_downloader.py @@ -22,7 +22,7 @@ import torch -from composer.loggers import Logger, MosaicMLLogger +from composer.loggers import Logger from composer.loggers.logger_destination import LoggerDestination from composer.utils import ( MLFlowObjectStore, @@ -308,12 +308,13 @@ def remote_backend(self) -> ObjectStore: return self._remote_backend def init(self, state: State, logger: Logger) -> None: + del logger # unused + if self._worker_flag is not None: raise RuntimeError('The RemoteUploaderDownloader is already initialized.') self._worker_flag = self._finished_cls() self._run_name = state.run_name 
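The `_log_checkpoint_upload` helper added above is the pattern this patch standardizes on: instead of the uploader holding a logger reference, the callback filters `logger.destinations` for a `MosaicMLLogger` and pushes a timestamped metadata entry. A hedged sketch of that pattern in a standalone callback; the callback name and metadata key are made up for illustration:

import time

from composer.core import Callback, State
from composer.loggers import Logger, MosaicMLLogger

class UploadedEventCallback(Callback):
    """Illustrative callback that stamps an event into MosaicML run metadata."""

    def batch_end(self, state: State, logger: Logger) -> None:
        del state  # unused
        for destination in logger.destinations:
            if isinstance(destination, MosaicMLLogger):
                # force_flush pushes the metadata immediately rather than waiting
                # for the logger's periodic flush
                destination.log_metadata({'my_event_time': time.time()}, force_flush=True)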
file_name_to_test = self._remote_file_name('.credentials_validated_successfully') - self._logger = logger # Create the enqueue thread self._enqueue_thread_flag = self._finished_cls() @@ -426,9 +427,6 @@ def _enqueue_uploads(self): break self._enqueued_objects.remove(object_name) self._completed_queue.task_done() - for destination in self._logger.destinations: - if isinstance(destination, MosaicMLLogger): - destination.log_metadata({'checkpoint_uploaded_time': time.time()}, force_flush=True) # Enqueue all objects that are in self._logged_objects but not in self._file_upload_queue objects_to_delete = [] From 69b8b236b6705060130c7ca682a62528d3fba2ac Mon Sep 17 00:00:00 2001 From: Ethan Ma Date: Tue, 16 Jul 2024 12:55:52 -0700 Subject: [PATCH 62/69] Add mosaicmllogger attr for fit start (#3467) --- composer/loggers/mosaicml_logger.py | 5 +++++ tests/loggers/test_mosaicml_logger.py | 1 + 2 files changed, 6 insertions(+) diff --git a/composer/loggers/mosaicml_logger.py b/composer/loggers/mosaicml_logger.py index d7c83b85fa..2de7243d31 100644 --- a/composer/loggers/mosaicml_logger.py +++ b/composer/loggers/mosaicml_logger.py @@ -135,6 +135,11 @@ def fit_end(self, state: State, logger: Logger) -> None: self.log_metadata(training_progress_data) self._flush_metadata(force_flush=True) + def fit_start(self, state: State, logger: Logger) -> None: + # Log model training started time for run events + self.log_metadata({'train_started_time': time.time()}) + self._flush_metadata(force_flush=True) + def eval_end(self, state: State, logger: Logger) -> None: self._flush_metadata(force_flush=True) diff --git a/tests/loggers/test_mosaicml_logger.py b/tests/loggers/test_mosaicml_logger.py index 795c8da56b..e308dab122 100644 --- a/tests/loggers/test_mosaicml_logger.py +++ b/tests/loggers/test_mosaicml_logger.py @@ -321,6 +321,7 @@ def test_run_events_logged(monkeypatch): assert metadata['mosaicml/training_progress'] == '[batch=4/4]' assert 'mosaicml/training_sub_progress' not in metadata assert isinstance(metadata['mosaicml/train_finished_time'], float) + assert isinstance(metadata['mosaicml/train_started_time'], float) def test_token_training_progress_metrics(): From 15c329e260c9bfe2770333f3e64a071fb5c60171 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 18 Jul 2024 17:25:28 +0000 Subject: [PATCH 63/69] Bump coverage[toml] from 7.5.4 to 7.6.0 (#3471) Bumps [coverage[toml]](https://github.com/nedbat/coveragepy) from 7.5.4 to 7.6.0. - [Release notes](https://github.com/nedbat/coveragepy/releases) - [Changelog](https://github.com/nedbat/coveragepy/blob/master/CHANGES.rst) - [Commits](https://github.com/nedbat/coveragepy/compare/7.5.4...7.6.0) --- updated-dependencies: - dependency-name: coverage[toml] dependency-type: direct:development update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8f8498392d..eccba856fd 100644 --- a/setup.py +++ b/setup.py @@ -103,7 +103,7 @@ def package_files(prefix: str, directory: str, extension: str): # Should manually update dependency versions occassionally. 
'custom_inherit==2.4.1', 'junitparser==3.1.2', - 'coverage[toml]==7.5.4', + 'coverage[toml]==7.6.0', 'fasteners==0.18', # object store tests require fasteners 'pytest==7.4.4', 'ipython==8.11.0', From 8a09a3be711038ddf3bccb88402e2b438ed3208c Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Sun, 21 Jul 2024 15:18:15 -0700 Subject: [PATCH 64/69] Bump flash attention to 2.6.1 (#3476) --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 970af2f1ef..d854715568 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -261,7 +261,7 @@ RUN if [ -n "$MOFED_VERSION" ] ; then \ RUN if [ -n "$CUDA_VERSION" ] ; then \ pip${PYTHON_VERSION} install --upgrade --no-cache-dir ninja==1.11.1 && \ pip${PYTHON_VERSION} install --upgrade --no-cache-dir --force-reinstall packaging==22.0 && \ - MAX_JOBS=1 pip${PYTHON_VERSION} install --no-cache-dir --no-build-isolation flash-attn==2.5.8; \ + MAX_JOBS=1 pip${PYTHON_VERSION} install --no-cache-dir --no-build-isolation flash-attn==2.6.1; \ cd .. ; \ fi From 779ff3e9a218f6caa696b032677133938085e79d Mon Sep 17 00:00:00 2001 From: Kevin DeShawn Date: Mon, 22 Jul 2024 16:03:50 -0500 Subject: [PATCH 65/69] cpu --- .github/workflows/pr-cpu.yaml | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index 23129715db..1e2d832e74 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -9,7 +9,8 @@ concurrency: cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }} jobs: pytest-cpu: - uses: mosaicml/ci-testing/.github/workflows/pytest-cpu.yaml@v0.0.9 + name: ${{ matrix.name }} + runs-on: linux-ubuntu-latest strategy: matrix: include: @@ -29,20 +30,21 @@ jobs: container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04 markers: not daily and not remote and not gpu and doctest pytest_command: coverage run -m pytest tests/test_docs.py - name: ${{ matrix.name }} - if: github.repository_owner == 'mosaicml' - with: - composer_package_name: mosaicml - container: ${{ matrix.container }} - name: ${{ matrix.name }} - pip_deps: "[all]" - pytest-command: ${{ matrix.pytest_command }} - pytest-markers: ${{ matrix.markers }} - safe_directory: composer + steps: + - name: Checkout code + uses: actions/checkout@v2 + - name: Run PR CPU Tests + uses: mosaicml/ci-testing/.github/actions/pytest-cpu@v0.1.0 + with: + container: ${{ matrix.container }} + name: ${{ matrix.name }} + pip_deps: "[all]" + pytest_command: ${{ matrix.pytest_command }} + pytest_markers: ${{ matrix.markers }} + safe_directory: composer + composer_package_name: mosaicml coverage: uses: ./.github/workflows/coverage.yaml name: Coverage Results if: github.repository_owner == 'mosaicml' needs: [pytest-cpu] - with: - download-path: artifacts From 2c0eac2ad47bd667d68648b72def7cbba903d9f9 Mon Sep 17 00:00:00 2001 From: Kevin DeShawn Date: Mon, 22 Jul 2024 16:22:33 -0500 Subject: [PATCH 66/69] gpu --- .github/workflows/pr-gpu.yaml | 103 +++++++++++++++++++--------------- 1 file changed, 57 insertions(+), 46 deletions(-) diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml index f6de8908c1..08365b2262 100644 --- a/.github/workflows/pr-gpu.yaml +++ b/.github/workflows/pr-gpu.yaml @@ -9,7 +9,8 @@ concurrency: cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }} jobs: pytest-gpu-1: - uses: 
mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.9 + name: ${{ matrix.name }} + runs-on: linux-ubuntu-latest strategy: matrix: include: @@ -18,24 +19,27 @@ jobs: markers: not daily and not remote and gpu and (doctest or not doctest) pytest_command: coverage run -m pytest composer_package_name: mosaicml - name: ${{ matrix.name }} if: github.repository_owner == 'mosaicml' - with: - composer_package_name: ${{ matrix.composer_package_name }} - container: ${{ matrix.container }} - git_repo: mosaicml/composer - mcloud-timeout: 2250 - name: ${{ matrix.name }} - pip_deps: "[all]" - pytest-command: ${{ matrix.pytest_command }} - pytest-markers: ${{ matrix.markers }} - python-version: 3.9 - gpu_num: 1 - secrets: - mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }} - + steps: + - name: Checkout code + uses: actions/checkout@v3 + - name: Run PR GPU Tests + uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.0 + with: + composer_package_name: ${{ matrix.composer_package_name }} + container: ${{ matrix.container }} + git_repo: mosaicml/composer + mcloud_timeout: 2250 + name: ${{ matrix.name }} + pip_deps: "[all]" + pytest_command: ${{ matrix.pytest_command }} + pytest_markers: ${{ matrix.markers }} + python_version: 3.9 + gpu_num: 1 + mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }} pytest-gpu-2: - uses: mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.9 + name: ${{ matrix.name }} + runs-on: linux-ubuntu-latest strategy: matrix: include: @@ -44,25 +48,29 @@ jobs: markers: not daily and not remote and gpu and (doctest or not doctest) pytest_command: coverage run -m pytest composer_package_name: mosaicml - name: ${{ matrix.name }} if: github.repository_owner == 'mosaicml' - with: - composer_package_name: ${{ matrix.composer_package_name }} - container: ${{ matrix.container }} - git_repo: mosaicml/composer - mcloud-timeout: 2250 - name: ${{ matrix.name }} - pip_deps: "[all]" - pytest-command: ${{ matrix.pytest_command }} - pytest-markers: ${{ matrix.markers }} - python-version: 3.9 - gpu_num: 2 - secrets: - mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }} + steps: + - name: Checkout code + uses: actions/checkout@v3 + - name: Run PR GPU Tests + uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.0 + with: + composer_package_name: ${{ matrix.composer_package_name }} + container: ${{ matrix.container }} + git_repo: mosaicml/composer + mcloud_timeout: 2250 + name: ${{ matrix.name }} + pip_deps: "[all]" + pytest_command: ${{ matrix.pytest_command }} + pytest_markers: ${{ matrix.markers }} + python_version: 3.9 + gpu_num: 2 + mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }} pytest-gpu-4: - uses: mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.9 + name: ${{ matrix.name }} + runs-on: linux-ubuntu-latest strategy: matrix: include: @@ -71,18 +79,21 @@ jobs: markers: not daily and not remote and gpu and (doctest or not doctest) pytest_command: coverage run -m pytest composer_package_name: mosaicml - name: ${{ matrix.name }} if: github.repository_owner == 'mosaicml' - with: - composer_package_name: ${{ matrix.composer_package_name }} - container: ${{ matrix.container }} - git_repo: mosaicml/composer - mcloud-timeout: 2250 - name: ${{ matrix.name }} - pip_deps: "[all]" - pytest-command: ${{ matrix.pytest_command }} - pytest-markers: ${{ matrix.markers }} - python-version: 3.9 - gpu_num: 4 - secrets: - mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }} + steps: + - name: Checkout code + uses: actions/checkout@v3 + - name: Run PR GPU Tests + uses: 
mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.0 + with: + composer_package_name: ${{ matrix.composer_package_name }} + container: ${{ matrix.container }} + git_repo: mosaicml/composer + mcloud_timeout: 2250 + name: ${{ matrix.name }} + pip_deps: "[all]" + pytest_command: ${{ matrix.pytest_command }} + pytest_markers: ${{ matrix.markers }} + python_version: 3.9 + gpu_num: 4 + mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }} \ No newline at end of file From 2b3ff8c0cbee2e791f087aff92109cb48d50d8fa Mon Sep 17 00:00:00 2001 From: Kevin DeShawn Date: Mon, 22 Jul 2024 16:25:42 -0500 Subject: [PATCH 67/69] coverage fix --- .github/workflows/pr-cpu.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index 1e2d832e74..1303fb54c9 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -48,3 +48,5 @@ jobs: name: Coverage Results if: github.repository_owner == 'mosaicml' needs: [pytest-cpu] + with: + download-path: artifacts From bd6515c3e091d4171c31c0bb6caae2be550c3ee1 Mon Sep 17 00:00:00 2001 From: Kevin DeShawn Date: Mon, 22 Jul 2024 17:18:14 -0500 Subject: [PATCH 68/69] lint --- .github/workflows/pr-cpu.yaml | 24 ++++----- .github/workflows/pr-gpu.yaml | 99 ++++++++++++++++++----------------- 2 files changed, 63 insertions(+), 60 deletions(-) diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index 1303fb54c9..2e0ac7b20c 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -31,18 +31,18 @@ jobs: markers: not daily and not remote and not gpu and doctest pytest_command: coverage run -m pytest tests/test_docs.py steps: - - name: Checkout code - uses: actions/checkout@v2 - - name: Run PR CPU Tests - uses: mosaicml/ci-testing/.github/actions/pytest-cpu@v0.1.0 - with: - container: ${{ matrix.container }} - name: ${{ matrix.name }} - pip_deps: "[all]" - pytest_command: ${{ matrix.pytest_command }} - pytest_markers: ${{ matrix.markers }} - safe_directory: composer - composer_package_name: mosaicml + - name: Checkout code + uses: actions/checkout@v2 + - name: Run PR CPU Tests + uses: mosaicml/ci-testing/.github/actions/pytest-cpu@v0.1.0 + with: + container: ${{ matrix.container }} + name: ${{ matrix.name }} + pip_deps: "[all]" + pytest_command: ${{ matrix.pytest_command }} + pytest_markers: ${{ matrix.markers }} + safe_directory: composer + composer_package_name: mosaicml coverage: uses: ./.github/workflows/coverage.yaml name: Coverage Results diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml index 08365b2262..a2715a5844 100644 --- a/.github/workflows/pr-gpu.yaml +++ b/.github/workflows/pr-gpu.yaml @@ -21,22 +21,23 @@ jobs: composer_package_name: mosaicml if: github.repository_owner == 'mosaicml' steps: - - name: Checkout code - uses: actions/checkout@v3 - - name: Run PR GPU Tests - uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.0 - with: - composer_package_name: ${{ matrix.composer_package_name }} - container: ${{ matrix.container }} - git_repo: mosaicml/composer - mcloud_timeout: 2250 - name: ${{ matrix.name }} - pip_deps: "[all]" - pytest_command: ${{ matrix.pytest_command }} - pytest_markers: ${{ matrix.markers }} - python_version: 3.9 - gpu_num: 1 - mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }} + - name: Checkout code + uses: actions/checkout@v3 + - name: Run PR GPU Tests + uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.0 + with: + composer_package_name: ${{ matrix.composer_package_name }} + container: ${{ 
matrix.container }} + git_repo: mosaicml/composer + mcloud_timeout: 2250 + name: ${{ matrix.name }} + pip_deps: "[all]" + pytest_command: ${{ matrix.pytest_command }} + pytest_markers: ${{ matrix.markers }} + python_version: 3.9 + gpu_num: 1 + mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }} + ci_repo_gpu_test_ref: v0.1.0 pytest-gpu-2: name: ${{ matrix.name }} runs-on: linux-ubuntu-latest @@ -50,22 +51,23 @@ jobs: composer_package_name: mosaicml if: github.repository_owner == 'mosaicml' steps: - - name: Checkout code - uses: actions/checkout@v3 - - name: Run PR GPU Tests - uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.0 - with: - composer_package_name: ${{ matrix.composer_package_name }} - container: ${{ matrix.container }} - git_repo: mosaicml/composer - mcloud_timeout: 2250 - name: ${{ matrix.name }} - pip_deps: "[all]" - pytest_command: ${{ matrix.pytest_command }} - pytest_markers: ${{ matrix.markers }} - python_version: 3.9 - gpu_num: 2 - mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }} + - name: Checkout code + uses: actions/checkout@v3 + - name: Run PR GPU Tests + uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.0 + with: + composer_package_name: ${{ matrix.composer_package_name }} + container: ${{ matrix.container }} + git_repo: mosaicml/composer + mcloud_timeout: 2250 + name: ${{ matrix.name }} + pip_deps: "[all]" + pytest_command: ${{ matrix.pytest_command }} + pytest_markers: ${{ matrix.markers }} + python_version: 3.9 + gpu_num: 2 + mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }} + ci_repo_gpu_test_ref: v0.1.0 pytest-gpu-4: @@ -81,19 +83,20 @@ jobs: composer_package_name: mosaicml if: github.repository_owner == 'mosaicml' steps: - - name: Checkout code - uses: actions/checkout@v3 - - name: Run PR GPU Tests - uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.0 - with: - composer_package_name: ${{ matrix.composer_package_name }} - container: ${{ matrix.container }} - git_repo: mosaicml/composer - mcloud_timeout: 2250 - name: ${{ matrix.name }} - pip_deps: "[all]" - pytest_command: ${{ matrix.pytest_command }} - pytest_markers: ${{ matrix.markers }} - python_version: 3.9 - gpu_num: 4 - mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }} \ No newline at end of file + - name: Checkout code + uses: actions/checkout@v3 + - name: Run PR GPU Tests + uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.0 + with: + composer_package_name: ${{ matrix.composer_package_name }} + container: ${{ matrix.container }} + git_repo: mosaicml/composer + mcloud_timeout: 2250 + name: ${{ matrix.name }} + pip_deps: "[all]" + pytest_command: ${{ matrix.pytest_command }} + pytest_markers: ${{ matrix.markers }} + python_version: 3.9 + gpu_num: 4 + mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }} + ci_repo_gpu_test_ref: v0.1.0 \ No newline at end of file From 4488555a93498ffa28511630a787746a85cd47c5 Mon Sep 17 00:00:00 2001 From: Kevin DeShawn <126115026+KevDevSha@users.noreply.github.com> Date: Mon, 22 Jul 2024 17:53:20 -0500 Subject: [PATCH 69/69] Update pr-cpu.yaml --- .github/workflows/pr-cpu.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index 2e0ac7b20c..9636f87352 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -1,6 +1,6 @@ name: PR CPU tests on: - pull_request: + pull_request_target: workflow_dispatch: # Cancel old runs when a new commit is pushed to the same branch if not on main # or dev