From 081dabb0c5bdf297e0082c4c8a0f7e15070b2eca Mon Sep 17 00:00:00 2001
From: Evan Racah <evan.racah@databricks.com>
Date: Tue, 6 Aug 2024 13:21:08 -0700
Subject: [PATCH 01/12] Fix autoresume docstring (save_overwrite) (#3526)

save_overwrite is no longer required to be false for autoresume
---
 composer/trainer/trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py
index 8b1c6d8f93..b2f829ca10 100644
--- a/composer/trainer/trainer.py
+++ b/composer/trainer/trainer.py
@@ -975,7 +975,7 @@ class Trainer:
             (default: ``False``)
         autoresume (bool, optional): Whether or not to enable autoresume, which allows for stopping and resuming
             training. This allows use of spot instances, as the training run is now fault tolerant.  This parameter
-            requires ``save_folder`` and ``run_name`` to be specified and ``save_overwrite`` to be ``False``.
+            requires ``save_folder`` and ``run_name`` to be specified.
             (default: ``False``)
 
             When enabled, the save_folder is checked for checkpoints of the format ``"{save_folder}/{save_latest_filename}"``,

From bd7227c0c8205534bcd6ed17f646ad0a0267a2b3 Mon Sep 17 00:00:00 2001
From: Daniel King <43149077+dakinggg@users.noreply.github.com>
Date: Wed, 7 Aug 2024 09:45:34 -0700
Subject: [PATCH 02/12] Unpin pip (#3524)

---
 docker/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 12ea07bd51..80ae8bad2e 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -172,7 +172,7 @@ RUN add-apt-repository ppa:deadsnakes/ppa && \
     rm -rf /var/lib/apt/lists/*
 
 RUN curl -fsSL https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} - && \
-    pip${PYTHON_VERSION} install --no-cache-dir --upgrade 'pip<23' 'setuptools<70.0.0'
+    pip${PYTHON_VERSION} install --no-cache-dir --upgrade pip 'setuptools<70.0.0'
 
 #################
 # Install Pytorch

From a15b18ce18db6df4cff4e10251adcb9d5c5845db Mon Sep 17 00:00:00 2001
From: Charles Tang <j316chuck@users.noreply.github.com>
Date: Wed, 7 Aug 2024 10:18:54 -0700
Subject: [PATCH 03/12] Add FSDP input validation for use_orig_params and
 activation_cpu_offload flag (#3515)

---
 composer/core/state.py     |  4 ++++
 tests/trainer/test_fsdp.py | 22 ++++++++++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/composer/core/state.py b/composer/core/state.py
index 5c429a1cd4..ca20dd1011 100644
--- a/composer/core/state.py
+++ b/composer/core/state.py
@@ -640,6 +640,10 @@ def _validate_parallelism_configs(self):
             if error_message != '':
                 raise ValueError(error_message)
 
+        # Validate FSDP config parameters.
+        if self.fsdp_config and self.fsdp_config.activation_cpu_offload and not self.fsdp_config.use_orig_params:
+            raise ValueError('activation_cpu_offload=True is not supported with use_orig_params=False.')
+
         # Validate FSDP state dict type
         if self.fsdp_state_dict_type not in [None, 'full', 'sharded']:
             if self.fsdp_state_dict_type == 'local':
diff --git a/tests/trainer/test_fsdp.py b/tests/trainer/test_fsdp.py
index 4c936f5402..7b9bd4825c 100644
--- a/tests/trainer/test_fsdp.py
+++ b/tests/trainer/test_fsdp.py
@@ -621,6 +621,28 @@ def test_fsdp_shard(world_size: int):
     )
 
 
+@pytest.mark.gpu
+@world_size(2)
+def test_fsdp_invalid_config_throws_error(world_size: int):
+    model = SimpleModel()
+    model.fc1._fsdp_wrap = True  # pyright: ignore[reportGeneralTypeIssues]
+    model.fc2._fsdp_wrap = True  # pyright: ignore[reportGeneralTypeIssues]
+
+    expected_error = 'activation_cpu_offload=True is not supported with use_orig_params=False.'
+
+    with pytest.raises(ValueError, match=expected_error):
+        _ = Trainer(
+            model=model,
+            parallelism_config={
+                'fsdp': {
+                    'use_orig_params': False,
+                    'activation_cpu_offload': True,
+                },
+            },
+            max_duration='3ba',
+        )
+
+
 @pytest.mark.gpu
 @world_size(2)
 def test_fsdp_shard_and_replicate(world_size: int):

From 4a9756f11c51dae01daba6c3bddf9d94169f1d89 Mon Sep 17 00:00:00 2001
From: Mihir Patel <mihir.v.patel7@gmail.com>
Date: Thu, 8 Aug 2024 11:23:35 -0700
Subject: [PATCH 04/12] hasattr check for Wandb 0.17.6 (#3531)

---
 .github/workflows/daily.yaml     | 3 ---
 composer/loggers/wandb_logger.py | 6 +++++-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml
index aa97c755c8..d7616ffbbf 100644
--- a/.github/workflows/daily.yaml
+++ b/.github/workflows/daily.yaml
@@ -77,13 +77,10 @@ jobs:
       pytest-command: ${{ matrix.pytest_command }}
       pytest-markers: ${{ matrix.markers }}
       composer_package_name: ${{ matrix.composer_package_name }}
-      pytest-wandb-entity: "mosaicml-public-integration-tests"
-      pytest-wandb-project: "integration-tests-${{ github.sha }}"
       safe_directory: composer
     secrets:
       aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
       aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-      wandb-api-key: ${{ secrets.WANDB_API_KEY }}
       code-eval-device: ${{ secrets.CODE_EVAL_DEVICE }}
       code-eval-url: ${{ secrets.CODE_EVAL_URL }}
       code-eval-apikey: ${{ secrets.CODE_EVAL_APIKEY }}
diff --git a/composer/loggers/wandb_logger.py b/composer/loggers/wandb_logger.py
index 4a2afe5c84..d76ee1fbac 100644
--- a/composer/loggers/wandb_logger.py
+++ b/composer/loggers/wandb_logger.py
@@ -200,7 +200,11 @@ def init(self, state: State, logger: Logger) -> None:
         if self._enabled:
             wandb.init(**self._init_kwargs)
             assert wandb.run is not None, 'The wandb run is set after init'
-            entity_and_project = [str(wandb.run.entity), str(wandb.run.project)]
+            if hasattr(wandb.run, 'entity') and hasattr(wandb.run, 'project'):
+                entity_and_project = [str(wandb.run.entity), str(wandb.run.project)]
+            else:
+                # Run does not have attributes if wandb is in disabled mode, so we must mock it
+                entity_and_project = ['disabled', 'disabled']
             self.run_dir = wandb.run.dir
             self.run_url = wandb.run.get_url()
             atexit.register(self._set_is_in_atexit)

From f6c00b8292c34280fa1297a14ef1c3cee5f12e22 Mon Sep 17 00:00:00 2001
From: Mihir Patel <mihir.v.patel7@gmail.com>
Date: Thu, 8 Aug 2024 12:42:24 -0700
Subject: [PATCH 05/12] Fix FSDP Config Validation (#3530)

---
 composer/core/state.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/composer/core/state.py b/composer/core/state.py
index ca20dd1011..cbd7fc41db 100644
--- a/composer/core/state.py
+++ b/composer/core/state.py
@@ -641,12 +641,12 @@ def _validate_parallelism_configs(self):
                 raise ValueError(error_message)
 
         # Validate FSDP config parameters.
-        if self.fsdp_config and self.fsdp_config.activation_cpu_offload and not self.fsdp_config.use_orig_params:
+        if self.fsdp_config is not None and self.fsdp_config.activation_cpu_offload and not self.fsdp_config.use_orig_params:
             raise ValueError('activation_cpu_offload=True is not supported with use_orig_params=False.')
 
         # Validate FSDP state dict type
-        if self.fsdp_state_dict_type not in [None, 'full', 'sharded']:
-            if self.fsdp_state_dict_type == 'local':
+        if self.fsdp_config is not None and self.fsdp_config.state_dict_type not in [None, 'full', 'sharded']:
+            if self.fsdp_config.state_dict_type == 'local':
                 raise ValueError(
                     'Composer and PyTorch no longer support saving or loading local state dicts. '
                     'To upgrade an older checkpoint, use Composer version 0.18.1 and export as '
@@ -654,7 +654,7 @@ def _validate_parallelism_configs(self):
                 )
             raise ValueError(
                 f'fsdp_state_dict_type must be one of [None, "full", "sharded"], but got '
-                f'{self.fsdp_state_dict_type}',
+                f'{self.fsdp_config.state_dict_type}',
             )
         if self.fsdp_sharded_state_dict_enabled and self.save_metrics:
             # Sharded state dict breaks in many different ways with torchmetrics, due to both sharding

From 14f5445f94ab3b558d890339fe2fe5e57d41dcca Mon Sep 17 00:00:00 2001
From: Mihir Patel <mihir.v.patel7@gmail.com>
Date: Thu, 8 Aug 2024 13:18:17 -0700
Subject: [PATCH 06/12] Remove dev on github workflows (#3536)

---
 .github/PULL_REQUEST_TEMPLATE.md    | 6 +++---
 .github/workflows/code-quality.yaml | 1 -
 .github/workflows/daily.yaml        | 1 -
 .github/workflows/pr-docker.yaml    | 1 -
 .github/workflows/smoketest.yaml    | 1 -
 5 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index 8b249a5ccf..a5e905ebc7 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -16,13 +16,13 @@ Example:
 -->
 
 # Before submitting
-- [ ] Have you read the [contributor guidelines](https://github.com/mosaicml/composer/blob/dev/CONTRIBUTING.md)?
+- [ ] Have you read the [contributor guidelines](https://github.com/mosaicml/composer/blob/main/CONTRIBUTING.md)?
 - [ ] Is this change a documentation change or typo fix? If so, skip the rest of this checklist.
 - [ ] Was this change discussed/approved in a GitHub issue first? It is much more likely to be merged if so.
 - [ ] Did you update any related docs and document your change?
-- [ ] Did you update any related tests and add any new tests related to your change? (see [testing](https://github.com/mosaicml/composer/blob/dev/CONTRIBUTING.md#running-tests))
+- [ ] Did you update any related tests and add any new tests related to your change? (see [testing](https://github.com/mosaicml/composer/blob/main/CONTRIBUTING.md#running-tests))
 - [ ] Did you run the tests locally to make sure they pass?
-- [ ] Did you run `pre-commit` on your change? (see the `pre-commit` section of [prerequisites](https://github.com/mosaicml/composer/blob/dev/CONTRIBUTING.md#prerequisites))
+- [ ] Did you run `pre-commit` on your change? (see the `pre-commit` section of [prerequisites](https://github.com/mosaicml/composer/blob/main/CONTRIBUTING.md#prerequisites))
 
 <!--
 Thanks so much for contributing to composer! We really appreciate it :)
diff --git a/.github/workflows/code-quality.yaml b/.github/workflows/code-quality.yaml
index 432e031cb4..7e0ba468c1 100644
--- a/.github/workflows/code-quality.yaml
+++ b/.github/workflows/code-quality.yaml
@@ -2,7 +2,6 @@ name: Code Quality Checks
 on:
   push:
     branches:
-    - dev
     - main
     - release/**
   pull_request:
diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml
index d7616ffbbf..b64e68d493 100644
--- a/.github/workflows/daily.yaml
+++ b/.github/workflows/daily.yaml
@@ -4,7 +4,6 @@ on:
   - cron: "30 2 * * *"  # 2:30 every day
   push:
     branches:
-    - dev
     - main
     - release/**
   workflow_dispatch:
diff --git a/.github/workflows/pr-docker.yaml b/.github/workflows/pr-docker.yaml
index 93f0b51be1..352eab881b 100644
--- a/.github/workflows/pr-docker.yaml
+++ b/.github/workflows/pr-docker.yaml
@@ -2,7 +2,6 @@ name: PR Docker
 on:
   pull_request:
     branches:
-    - dev
     - main
     - release/**
     paths:
diff --git a/.github/workflows/smoketest.yaml b/.github/workflows/smoketest.yaml
index b7bb09aaab..a08bfe7d53 100644
--- a/.github/workflows/smoketest.yaml
+++ b/.github/workflows/smoketest.yaml
@@ -2,7 +2,6 @@ name: Smoketest
 on:
   push:
     branches:
-    - dev
     - main
     - release/**
   pull_request:

From bad3f0c59516d97967101c825ee92e9dfc1868c2 Mon Sep 17 00:00:00 2001
From: Mihir Patel <mihir.v.patel7@gmail.com>
Date: Thu, 8 Aug 2024 16:40:35 -0700
Subject: [PATCH 07/12] Remove dev branch in GPU workflows (#3539)

---
 .github/workflows/code-quality.yaml | 1 +
 .github/workflows/pr-gpu.yaml       | 2 +-
 .github/workflows/smoketest.yaml    | 1 +
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/code-quality.yaml b/.github/workflows/code-quality.yaml
index 7e0ba468c1..5a2c86221f 100644
--- a/.github/workflows/code-quality.yaml
+++ b/.github/workflows/code-quality.yaml
@@ -18,6 +18,7 @@ jobs:
   code-quality:
     runs-on: ubuntu-20.04
     timeout-minutes: 15
+    if: github.repository_owner == 'mosaicml'
     strategy:
       matrix:
         python_version:
diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml
index f6de8908c1..392a2665c8 100644
--- a/.github/workflows/pr-gpu.yaml
+++ b/.github/workflows/pr-gpu.yaml
@@ -6,7 +6,7 @@ on:
 # or dev
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
-  cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }}
+  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
 jobs:
   pytest-gpu-1:
     uses: mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.9
diff --git a/.github/workflows/smoketest.yaml b/.github/workflows/smoketest.yaml
index a08bfe7d53..a0a6c445d2 100644
--- a/.github/workflows/smoketest.yaml
+++ b/.github/workflows/smoketest.yaml
@@ -19,6 +19,7 @@ jobs:
   smoketest:
     runs-on: ubuntu-20.04
     timeout-minutes: 10
+    if: github.repository_owner == 'mosaicml'
     strategy:
       matrix:
         python_version:

From e9aee74eb8e29eceddfb7e449eb49874f972adb7 Mon Sep 17 00:00:00 2001
From: bigning <ning.wang@databricks.com>
Date: Fri, 9 Aug 2024 10:01:29 -0700
Subject: [PATCH 08/12] restore google cloud object store test (#3538)

---
 tests/utils/object_store/test_gs_object_store.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/tests/utils/object_store/test_gs_object_store.py b/tests/utils/object_store/test_gs_object_store.py
index b52d3cf977..93b8a05ccd 100644
--- a/tests/utils/object_store/test_gs_object_store.py
+++ b/tests/utils/object_store/test_gs_object_store.py
@@ -9,7 +9,6 @@
 from botocore.exceptions import ClientError
 from torch.utils.data import DataLoader
 
-from composer.loggers import RemoteUploaderDownloader
 from composer.optim import DecoupledSGDW
 from composer.trainer import Trainer
 from composer.utils import GCSObjectStore
@@ -17,15 +16,15 @@
 
 
 def get_gcs_os_from_trainer(trainer: Trainer) -> GCSObjectStore:
-    rud = [dest for dest in trainer.logger.destinations if isinstance(dest, RemoteUploaderDownloader)][0]
-    gcs_os = rud.remote_backend
+    assert trainer._checkpoint_saver is not None
+    assert trainer._checkpoint_saver.remote_uploader is not None
+    gcs_os = trainer._checkpoint_saver.remote_uploader.remote_backend
     assert isinstance(gcs_os, GCSObjectStore)
     return gcs_os
 
 
 @pytest.mark.gpu  # json auth is hard to set up on github actions / CPU tests
 @pytest.mark.remote
-@pytest.mark.skip(reason='Waiting for new GCP key to be approved')
 def test_gs_object_store_integration_hmac_auth(expected_use_gcs_sdk_val=False, client_should_be_none=True):
     model = SimpleModel()
     train_dataset = RandomClassificationDataset()
@@ -35,7 +34,7 @@ def test_gs_object_store_integration_hmac_auth(expected_use_gcs_sdk_val=False, c
         model=model,
         optimizers=optimizer,
         train_dataloader=train_dataloader,
-        save_folder='gs://mosaicml-internal-integration-testing/checkpoints/{run_name}',
+        save_folder='gs://mosaicml-runtime-internal-integration-testing/checkpoints/{run_name}',
         save_filename='test-model.pt',
         max_duration='1ba',
         precision='amp_bf16',
@@ -54,7 +53,7 @@ def test_gs_object_store_integration_hmac_auth(expected_use_gcs_sdk_val=False, c
         model=model,
         optimizers=optimizer,
         train_dataloader=train_dataloader,
-        load_path=f'gs://mosaicml-internal-integration-testing/checkpoints/{run_name}/test-model.pt',
+        load_path=f'gs://mosaicml-runtime-internal-integration-testing/checkpoints/{run_name}/test-model.pt',
         max_duration='2ba',
         precision='amp_bf16',
     )
@@ -64,7 +63,6 @@ def test_gs_object_store_integration_hmac_auth(expected_use_gcs_sdk_val=False, c
 
 @pytest.mark.gpu
 @pytest.mark.remote
-@pytest.mark.skip(reason='Waiting for new GCP key to be approved')
 def test_gs_object_store_integration_json_auth():
     with mock.patch.dict(os.environ):
         if 'GCS_KEY' in os.environ:

From 88caae9f69a1b5414a09fdf8d421dc6c1c1139f0 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Fri, 9 Aug 2024 14:42:53 -0700
Subject: [PATCH 09/12] Update moto[s3] requirement from <5,>=4.0.1 to
 >=4.0.1,<6 (#3516)

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: bigning <ning.wang@databricks.com>
---
 setup.py                                          | 2 +-
 tests/utils/object_store/object_store_settings.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 9571b5e118..11c82b5a37 100644
--- a/setup.py
+++ b/setup.py
@@ -138,7 +138,7 @@ def package_files(prefix: str, directory: str, extension: str):
     'pandoc==2.3',
     'pypandoc==1.13',
     'GitPython==3.1.43',
-    'moto[s3]>=4.0.1,<5',
+    'moto[s3]>=5.0.1,<6',
     'mock-ssh-server==0.9.1',
     'cryptography==42.0.8',
     'pytest-httpserver>=1.0.4,<1.1',
diff --git a/tests/utils/object_store/object_store_settings.py b/tests/utils/object_store/object_store_settings.py
index f11cf853b7..ac385603ba 100644
--- a/tests/utils/object_store/object_store_settings.py
+++ b/tests/utils/object_store/object_store_settings.py
@@ -89,7 +89,7 @@ def get_object_store_ctx(
             monkeypatch.setenv('AWS_SECURITY_TOKEN', 'testing')
             monkeypatch.setenv('AWS_SESSION_TOKEN', 'testing')
             monkeypatch.setenv('AWS_DEFAULT_REGION', 'us-east-1')
-            with moto.mock_s3():
+            with moto.mock_aws():
                 # create the dummy bucket
                 s3 = boto3.client('s3')
                 s3.create_bucket(Bucket=object_store_kwargs['bucket'])

From 1320825b5877924186f6c726289a98184b1fd027 Mon Sep 17 00:00:00 2001
From: bigning <ning.wang@databricks.com>
Date: Mon, 12 Aug 2024 09:59:57 -0700
Subject: [PATCH 10/12] use s3 boto3 Adaptive retry as default retry mode
 (#3543)

---
 composer/utils/object_store/s3_object_store.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/composer/utils/object_store/s3_object_store.py b/composer/utils/object_store/s3_object_store.py
index 0e39b19e7a..9dbaa6d544 100644
--- a/composer/utils/object_store/s3_object_store.py
+++ b/composer/utils/object_store/s3_object_store.py
@@ -92,6 +92,8 @@ def __init__(
 
         if client_config is None:
             client_config = {}
+        if 'retries' not in client_config:
+            client_config['retries'] = {'mode': 'adaptive'}
         config = Config(**client_config)
         if 'S3_ENDPOINT_URL' in os.environ and endpoint_url is None:
             endpoint_url = os.environ['S3_ENDPOINT_URL']

From a10a798b0a83e2be6d0955dba773ac03db5194c5 Mon Sep 17 00:00:00 2001
From: Mihir Patel <mihir.v.patel7@gmail.com>
Date: Mon, 12 Aug 2024 15:29:22 -0700
Subject: [PATCH 11/12] Bump to Pytorch 2.4  (#3542)

Co-authored-by: Chuck Tang <chuck.tang@databricks.com>
---
 .github/workflows/daily.yaml                  |  28 --
 .../docker-configure-build-push.yaml          |   2 +-
 .github/workflows/pr-cpu.yaml                 |   4 -
 .../ghost_batchnorm/ghost_batchnorm.py        |   4 +-
 composer/algorithms/swa/swa.py                |   2 +-
 composer/callbacks/image_visualizer.py        |  10 +-
 composer/callbacks/memory_snapshot.py         |   9 +-
 composer/callbacks/oom_observer.py            |   9 +-
 composer/core/state.py                        |  19 +-
 composer/distributed/dist_strategy.py         | 335 ++++++------------
 composer/distributed/mosaic_parallelism.py    |   7 +-
 composer/profiler/torch_profiler.py           |  69 ++--
 composer/profiler/utils.py                    |   8 +-
 composer/trainer/_patch_pytorch.py            | 168 +--------
 composer/trainer/trainer.py                   |   1 -
 composer/utils/checkpoint.py                  |  31 +-
 composer/utils/dist.py                        |   8 +-
 docker/Dockerfile                             |   9 +-
 docker/README.md                              |  12 +-
 docker/build_matrix.yaml                      | 143 +++-----
 docker/generate_build_matrix.py               |  14 +-
 setup.py                                      |   4 +-
 tests/trainer/test_fsdp_checkpoint.py         |  17 +-
 tests/utils/test_inference.py                 |   2 +-
 24 files changed, 276 insertions(+), 639 deletions(-)

diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml
index b64e68d493..ee94e89c2b 100644
--- a/.github/workflows/daily.yaml
+++ b/.github/workflows/daily.yaml
@@ -17,11 +17,6 @@ jobs:
     strategy:
       matrix:
         include:
-        - name: cpu-3.10-2.1
-          container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
-          markers: not daily and (remote or not remote) and not gpu and not doctest
-          pytest_command: coverage run -m pytest
-          composer_package_name: mosaicml
         - name: cpu-3.11-2.2
           container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
           markers: not daily and (remote or not remote) and not gpu and not doctest
@@ -42,11 +37,6 @@ jobs:
           markers: not daily and (remote or not remote) and not gpu and doctest
           pytest_command: coverage run -m pytest tests/test_docs.py
           composer_package_name: mosaicml
-        - name: daily-cpu-3.10-2.1
-          container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
-          markers: daily and (remote or not remote) and not gpu and not doctest
-          pytest_command: coverage run -m pytest
-          composer_package_name: mosaicml
         - name: daily-cpu-3.11-2.2
           container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
           markers: daily and (remote or not remote) and not gpu and not doctest
@@ -102,12 +92,6 @@ jobs:
         # Unlike CPU tests, we run daily tests together with GPU tests to minimize launch time
         # on MCLOUD and not eat up all GPUs at once
         include:
-        - name: "gpu-3.10-2.1-1-gpu"
-          container: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04
-          markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
-          pytest_command: "coverage run -m pytest"
-          composer_package_name: "mosaicml"
-          gpu_num: 1
         - name: "gpu-3.11-2.2-1-gpu"
           container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
           markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
@@ -120,12 +104,6 @@ jobs:
           pytest_command: "coverage run -m pytest"
           composer_package_name: "mosaicml"
           gpu_num: 1
-        - name: "gpu-3.10-2.1-2-gpu"
-          container: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04
-          markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
-          pytest_command: "coverage run -m pytest"
-          composer_package_name: "mosaicml"
-          gpu_num: 2
         - name: "gpu-3.11-2.2-2-gpu"
           container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
           markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
@@ -138,12 +116,6 @@ jobs:
           pytest_command: "coverage run -m pytest"
           composer_package_name: "mosaicml"
           gpu_num: 2
-        - name: "gpu-3.10-2.1-4-gpu"
-          container: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04
-          markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
-          pytest_command: "coverage run -m pytest"
-          composer_package_name: "mosaicml"
-          gpu_num: 4
         - name: "gpu-3.11-2.2-4-gpu"
           container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
           markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
diff --git a/.github/workflows/docker-configure-build-push.yaml b/.github/workflows/docker-configure-build-push.yaml
index 2b6bf4893d..a668e75217 100644
--- a/.github/workflows/docker-configure-build-push.yaml
+++ b/.github/workflows/docker-configure-build-push.yaml
@@ -36,7 +36,7 @@ on:
         required: true
 jobs:
   configure-build-push:
-    runs-on: ubuntu-latest
+    runs-on: mosaic-4wide
     steps:
     - name: Maximize Build Space on Worker
       uses: easimon/maximize-build-space@v4
diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml
index 23129715db..4d44e69824 100644
--- a/.github/workflows/pr-cpu.yaml
+++ b/.github/workflows/pr-cpu.yaml
@@ -13,10 +13,6 @@ jobs:
     strategy:
       matrix:
         include:
-        - name: cpu-3.10-2.1
-          container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
-          markers: not daily and not remote and not gpu and not doctest
-          pytest_command: coverage run -m pytest
         - name: cpu-3.11-2.2
           container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
           markers: not daily and not remote and not gpu and not doctest
diff --git a/composer/algorithms/ghost_batchnorm/ghost_batchnorm.py b/composer/algorithms/ghost_batchnorm/ghost_batchnorm.py
index 3943a1c345..92aed98808 100644
--- a/composer/algorithms/ghost_batchnorm/ghost_batchnorm.py
+++ b/composer/algorithms/ghost_batchnorm/ghost_batchnorm.py
@@ -168,7 +168,7 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:  # type: ignore
 
         nchunks: int = int(math.ceil(batch_size / self.ghost_batch_size))
         has_momentum: bool = hasattr(self.batchnorm, 'momentum')
-        original_momentum: float = self.batchnorm.momentum
+        original_momentum: Optional[float] = self.batchnorm.momentum
 
         if self.training and has_momentum:
             # applying the same batchnorm multiple times greatly increases
@@ -180,6 +180,7 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:  # type: ignore
         normalized_chunks = [self.batchnorm(chunk) for chunk in input.chunk(nchunks, 0)]
 
         if self.training and has_momentum:
+            assert original_momentum is not None
             self._unscale_momentum(original_momentum)
 
         return torch.cat(normalized_chunks, dim=0)
@@ -192,6 +193,7 @@ def from_batchnorm(module: torch.nn.Module, ghost_batch_size: int) -> '_GhostBat
 
     @torch.jit.unused
     def _scale_momentum(self, nchunks: int):
+        assert self.batchnorm.momentum is not None
         self.batchnorm.momentum = float(self.batchnorm.momentum) / nchunks
 
     @torch.jit.unused
diff --git a/composer/algorithms/swa/swa.py b/composer/algorithms/swa/swa.py
index 4177168a13..dd9826d44d 100644
--- a/composer/algorithms/swa/swa.py
+++ b/composer/algorithms/swa/swa.py
@@ -228,7 +228,7 @@ def _initialize_swa(self, state: State) -> None:
                 state.optimizers[0],
                 swa_lr=self.swa_lr,
                 anneal_epochs=self.anneal_steps,
-                anneal_strategy=self.anneal_strategy,
+                anneal_strategy=self.anneal_strategy,  # type: ignore
             )
 
         self.swa_model = AveragedModel(state.model, device=torch.device('cpu'))
diff --git a/composer/callbacks/image_visualizer.py b/composer/callbacks/image_visualizer.py
index e8381a944c..d86a2d97bc 100644
--- a/composer/callbacks/image_visualizer.py
+++ b/composer/callbacks/image_visualizer.py
@@ -164,18 +164,18 @@ def _make_segmentation_images(
     # Ensure the targets are in the expected format
     if infer_target_type(outputs, targets) == 'one_hot':
         if channels_last:
-            targets = targets.argmax(dim=-1).data.cpu().numpy()
+            targets = targets.argmax(dim=-1).data.cpu().numpy()  # type: ignore
         else:
-            targets = targets.argmax(dim=1).data.cpu().numpy()
+            targets = targets.argmax(dim=1).data.cpu().numpy()  # type: ignore
     else:
-        targets = targets.data.cpu().numpy()
+        targets = targets.data.cpu().numpy()  # type: ignore
     # Convert the outputs to the expected format
     if channels_last:
         num_classes = outputs.shape[-1]
-        outputs = outputs.argmax(dim=-1).cpu().numpy()
+        outputs = outputs.argmax(dim=-1).cpu().numpy()  # type: ignore
     else:
         num_classes = outputs.shape[1]
-        outputs = outputs.argmax(dim=1).cpu().numpy()
+        outputs = outputs.argmax(dim=1).cpu().numpy()  # type: ignore
     # Adjust targets such that negative values are mapped to one higher than the maximum class
     targets[targets < 0] = num_classes
 
diff --git a/composer/callbacks/memory_snapshot.py b/composer/callbacks/memory_snapshot.py
index 767f3abb0f..328d781d81 100644
--- a/composer/callbacks/memory_snapshot.py
+++ b/composer/callbacks/memory_snapshot.py
@@ -9,7 +9,6 @@
 from typing import Optional, Union
 
 import torch.cuda
-from packaging import version
 
 from composer import State
 from composer.core import Callback, State, Time, TimeUnit
@@ -94,13 +93,7 @@ def __init__(
             _, _, self.remote_path_in_bucket = parse_uri(remote_file_name)
         else:
             self.remote_path_in_bucket = None
-
-        if version.parse(torch.__version__.split('.dev')[0]) >= version.parse('2.1.0'):  # type: ignore
-            # MemorySnapshot is only supported in torch v2.1.0-rc1 or higher
-            self._enabled = True
-        else:
-            self._enabled = False
-            warnings.warn('Memory snapshot is supported after PyTorch 2.1.0. Skipping memory snapshot callback.')
+        self._enabled = True
 
     def init(self, state: State, logger: Logger) -> None:
         if not self._enabled:
diff --git a/composer/callbacks/oom_observer.py b/composer/callbacks/oom_observer.py
index d43685bab7..d85b4ec6ca 100644
--- a/composer/callbacks/oom_observer.py
+++ b/composer/callbacks/oom_observer.py
@@ -14,7 +14,6 @@
 from typing import Optional
 
 import torch.cuda
-from packaging import version
 
 from composer.core import Callback, State
 from composer.loggers import Logger
@@ -113,13 +112,7 @@ def __init__(
         else:
             self.remote_path_in_bucket = None
 
-        if version.parse(torch.__version__.split('.dev')[0]) >= version.parse('2.1.0'):  # type: ignore
-            # OOMObserver is only supported in torch v2.1.0 or higher
-            self._enabled = True
-        else:
-            self._enabled = False
-            warnings.warn('OOMObserver is supported after PyTorch 2.1.0. Disabling OOMObserver callback.')
-
+        self._enabled = True
         self.filename_config: Optional[SnapshotFileNameConfig] = None
 
     def init(self, state: State, logger: Logger) -> None:
diff --git a/composer/core/state.py b/composer/core/state.py
index cbd7fc41db..7c43473ace 100644
--- a/composer/core/state.py
+++ b/composer/core/state.py
@@ -979,7 +979,9 @@ def get_model_state_dict(self) -> dict[str, Any]:
         Returns:
             dict[str, Any]: The state dict for the model.
         """
-        if version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized():
+        if version.parse(torch.__version__) >= version.parse('2.4.0') or (
+            version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized()
+        ):
             from torch.distributed.checkpoint.state_dict import StateDictOptions, get_model_state_dict
             if self.fsdp_state_dict_type not in [None, 'full', 'sharded']:
                 raise NotImplementedError(
@@ -1017,7 +1019,9 @@ def get_optim_state_dict(self) -> dict[str, Any]:
         Returns:
             dict[str, Any]: The state dict for the optimizer.
         """
-        if version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized():
+        if version.parse(torch.__version__) >= version.parse('2.4.0') or (
+            version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized()
+        ):
             from torch.distributed.checkpoint.state_dict import StateDictOptions, get_optimizer_state_dict
             if self.fsdp_state_dict_type not in [None, 'full', 'sharded']:
                 raise NotImplementedError(
@@ -1327,7 +1331,9 @@ def load_model_state(
         model_on_rank = state_dict['model'] is not None
 
         if model_on_rank:
-            if version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized():
+            if version.parse(torch.__version__) >= version.parse('2.4.0') or (
+                version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized()
+            ):
                 from torch.distributed.checkpoint.state_dict import StateDictOptions, set_model_state_dict
                 try:
                     set_model_state_dict(
@@ -1430,14 +1436,17 @@ def load_optim_state(self, state_dict: dict[str, Any], strict: bool = True):
                 continue
 
             optim_state_dict = serialized_value[type(optimizer).__qualname__] if serialized_value is not None else None
-            if version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized():
+            if version.parse(torch.__version__) >= version.parse('2.4.0') or (
+                version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized()
+            ):
                 from torch.distributed.checkpoint.state_dict import StateDictOptions, set_optimizer_state_dict
 
                 # optim_state_dict is `None` on non-zero ranks when loading FSDP monolith
                 # checkpoint on rank 0 only. However, PyTorch modifies the state_dict (producing
                 # errors) before discarding the output. Accordingly, we mock the state dict.
                 # See: https://github.com/pytorch/pytorch/issues/125177
-                optim_state_dict = MagicMock() if optim_state_dict is None else optim_state_dict
+                if version.parse(torch.__version__) < version.parse('2.4.0'):
+                    optim_state_dict = MagicMock() if optim_state_dict is None else optim_state_dict
                 set_optimizer_state_dict(
                     model=self.model,
                     optimizers=optimizer,
diff --git a/composer/distributed/dist_strategy.py b/composer/distributed/dist_strategy.py
index f7adc79428..1b09a9fd74 100644
--- a/composer/distributed/dist_strategy.py
+++ b/composer/distributed/dist_strategy.py
@@ -3,7 +3,6 @@
 
 """Helpers for running distributed data parallel training."""
 
-import collections
 import logging
 import warnings
 from contextlib import contextmanager, nullcontext
@@ -15,16 +14,17 @@
     CheckpointImpl,
     apply_activation_checkpointing,
     checkpoint_wrapper,
+    offload_wrapper,
 )
 from torch.distributed.fsdp import FullyShardedDataParallel, ShardingStrategy
 from torch.distributed.fsdp._common_utils import clean_tensor_name
+from torch.distributed.fsdp.wrap import CustomPolicy
 from torch.nn.parallel import DistributedDataParallel
 from torchmetrics import Metric, MetricCollection
 
 from composer.core import Precision, State
 from composer.core.precision import _validate_precision
 from composer.devices import Device, DeviceGPU
-from composer.distributed.meta_safe_apply import meta_safe_apply
 from composer.distributed.mosaic_parallelism import (
     BACKWARD_PREFETCH_MAP,
     SHARDING_MAP,
@@ -397,177 +397,73 @@ def sync_hook(*args):
             if hasattr(obj, '_fsdp_wrap') and not bool(obj._fsdp_wrap):
                 continue
 
-            # Rather than verifying these changes with older PyTorch versions, we are fixing forward here
-            if version.parse(torch.__version__) > version.parse('2.1.0'):
-                # A dictionary of all tied parameter pointers to (module, attr) tuples
-                tied_pointers = {}
-
-                # Goes through all modules finding which weights have the same pointers
-                for mod in obj.modules():
-                    for attr_name, attr in mod.named_parameters(recurse=False):
-                        ptr = id(attr)
-                        mod_attr_list = tied_pointers.get(ptr, [])
-                        mod_attr_list.append((mod, attr_name))
-                        tied_pointers[ptr] = mod_attr_list
-
-                # Dictionary mapping the source module to a list of (target module, source attr, target attr) tuples
-                source_mod_to_mod_attr = {}
-                for mod_attr_list in tied_pointers.values():
-                    # If there is only one module for this pointer, then there is no weight tying
-                    if len(mod_attr_list) == 1:
-                        continue
-
-                    # Arbitrarily choose the first module as the source module
-                    first_mod, first_attr = mod_attr_list[0]
-                    source_mod_to_mod_attr[first_mod] = [
-                        (target_mod, first_attr, dest_attr) for target_mod, dest_attr in mod_attr_list[1:]
-                    ]
-
-                # Clean up no longer needed module references for memory safety
-                del tied_pointers
-
-                def _param_init_fn(module: torch.nn.Module) -> None:
-                    # If we do not have any parameters or buffers on meta device managed by this module directly, we do not need to call the parameter init function.
-                    # It is assumed that whatever process moved the parameters off of meta device initialized them.
-                    # We expect this to occur if we have tied weights, as the second module will already have the weights initialized.
-                    is_meta = any(param.is_meta for param in module.parameters(recurse=False)
-                                 ) or any(buffer.is_meta for buffer in module.buffers(recurse=False))
-                    if not is_meta:
-                        return
-
-                    # Move all parameters and buffers to the current device
-                    module.to_empty(device=f'cuda:{torch.cuda.current_device()}', recurse=False)
-
-                    # Redo weight tying, which will have been broken by the above line that moves parameters off of meta device
-                    if module in source_mod_to_mod_attr:
-                        for target_mod, first_attr, dest_attr in source_mod_to_mod_attr[module]:
-                            setattr(target_mod, dest_attr, getattr(module, first_attr))
-
-                    # Run the specified initialization
-                    if hasattr(obj, 'param_init_fn') and isinstance(obj.param_init_fn, Callable):
-                        obj.param_init_fn(module)
-                    elif hasattr(module, 'reset_parameters') and isinstance(module.reset_parameters, Callable):
-                        module.reset_parameters()
-                    else:
-                        raise ValueError(
-                            f'Object `{obj_name}` does not have a ``param_init_fn`` or a ``reset_parameters`` function. '
-                            'This leaves parameters without initialization. Please add a ``param_init_fn`` or ``reset_parameters`` '
-                            f'to module `{obj_name}`.',
-                        )
-            else:
-
-                def _param_init_fn(module: torch.nn.Module) -> None:
-                    # A dictionary of all tied parameter pointers to module names
-                    tied_pointers = {}
-
-                    # Goes through all modules finding which weights have the same pointers
-                    for name, mod in module.named_modules():
-                        # Since FSDP recursively wraps, at parent modules we can encounter already
-                        # wrapped weights, as a result we should skip any modules with `_fsdp_wrapped_module.`
-                        if '_fsdp_wrapped_module' in name:
-                            continue
-                        for attr in ['weight', 'bias']:
-                            if hasattr(mod, attr):
-                                mod_attr = getattr(mod, attr)
-                                if mod_attr is None:
-                                    continue
-                                ptr = id(mod_attr)
-                                ptr_attr = (ptr, attr)
-                                name_list = tied_pointers.get(ptr_attr, [])
-                                name_list.append(name)
-                                tied_pointers[ptr_attr] = name_list
-
-                    # Creates a dictionary of module names that should be tied together
-                    tied_mod_names = collections.defaultdict(list)
-                    # Creates a set of modules we should not initialize
-                    should_not_init_params = set()
-                    for ptr_attr_type, mod_names in tied_pointers.items():
-                        # No modules for this pointer are tied
-                        if len(mod_names) == 1:
-                            continue
-                        _, attr_type = ptr_attr_type
-                        first = next(mod_names.__iter__())
-                        for elem in mod_names:
-                            should_not_init_params.add('.'.join([elem, attr_type]))
-                            tied_mod_names[(first, attr_type)].append(elem)
-                        # Make sure at least one of the tied parameters is initialized
-                        should_not_init_params.remove('.'.join([first, attr_type]))
-
-                    meta_safe_apply(
-                        module,
-                        lambda t: torch.empty_like(t, device=f'cuda:{torch.cuda.current_device()}'),
-                        should_not_init_params,
-                        module_name='',
+            # A dictionary of all tied parameter pointers to (module, attr) tuples
+            tied_pointers = {}
+
+            # Goes through all modules finding which weights have the same pointers
+            for mod in obj.modules():
+                for attr_name, attr in mod.named_parameters(recurse=False):
+                    ptr = id(attr)
+                    mod_attr_list = tied_pointers.get(ptr, [])
+                    mod_attr_list.append((mod, attr_name))
+                    tied_pointers[ptr] = mod_attr_list
+
+            # Dictionary mapping the source module to a list of (target module, source attr, target attr) tuples
+            source_mod_to_mod_attr = {}
+            for mod_attr_list in tied_pointers.values():
+                # If there is only one module for this pointer, then there is no weight tying
+                if len(mod_attr_list) == 1:
+                    continue
+
+                # Arbitrarily choose the first module as the source module
+                first_mod, first_attr = mod_attr_list[0]
+                source_mod_to_mod_attr[first_mod] = [
+                    (target_mod, first_attr, dest_attr) for target_mod, dest_attr in mod_attr_list[1:]
+                ]
+
+            # Clean up no longer needed module references for memory safety
+            del tied_pointers
+
+            def _param_init_fn(module: torch.nn.Module) -> None:
+                # If we do not have any parameters or buffers on meta device managed by this module directly, we do not need to call the parameter init function.
+                # It is assumed that whatever process moved the parameters off of meta device initialized them.
+                # We expect this to occur if we have tied weights, as the second module will already have the weights initialized.
+                is_meta = any(param.is_meta for param in module.parameters(recurse=False)
+                             ) or any(buffer.is_meta for buffer in module.buffers(recurse=False))
+                if not is_meta:
+                    return
+
+                # Move all parameters and buffers to the current device
+                module.to_empty(device=f'cuda:{torch.cuda.current_device()}', recurse=False)
+
+                # Redo weight tying, which will have been broken by the above line that moves parameters off of meta device
+                if module in source_mod_to_mod_attr:
+                    for target_mod, first_attr, dest_attr in source_mod_to_mod_attr[module]:
+                        setattr(target_mod, dest_attr, getattr(module, first_attr))
+
+                # Run the specified initialization
+                if hasattr(obj, 'param_init_fn') and isinstance(obj.param_init_fn, Callable):
+                    obj.param_init_fn(module)
+                elif hasattr(module, 'reset_parameters') and isinstance(module.reset_parameters, Callable):
+                    module.reset_parameters()
+                else:
+                    raise ValueError(
+                        f'Object `{obj_name}` does not have a ``param_init_fn`` or a ``reset_parameters`` function. '
+                        'This leaves parameters without initialization. Please add a ``param_init_fn`` or ``reset_parameters`` '
+                        f'to module `{obj_name}`.',
                     )
 
-                    if len(tied_mod_names) > 0:
-                        warnings.warn((
-                            'The passed in model appears to have tied weights. In order to '
-                            'support effective weight tying, the tied modules need to be '
-                            'in the same FSDP module. If the weights are not properly tied '
-                            'it can lead to loss spikes. We have tried our best to ensure '
-                            'the tied weights are in the same FSDP module.'
-                        ))
-
-                    # Redoes weight tying
-                    for name_attr, tied_names in tied_mod_names.items():
-                        name, attr = name_attr
-                        src_mod = module.get_submodule(name)
-                        # We need to make sure the source and destination
-                        # modules end up in the same FSDP module otherwise
-                        # with sharding weight tying gets violated
-                        src_mod._fsdp_wrap = False  # type: ignore
-                        src_params = getattr(src_mod, attr)
-                        for tied_name in tied_names:
-                            dest_mod = module.get_submodule(tied_name)
-                            dest_mod._fsdp_wrap = False  # type: ignore
-                            setattr(dest_mod, attr, src_params)
-
-                    if hasattr(obj, 'param_init_fn') and isinstance(obj.param_init_fn, Callable):
-                        module.apply(obj.param_init_fn)
-                    elif hasattr(module, 'reset_parameters') and isinstance(module.reset_parameters, Callable):
-                        module.reset_parameters()
-                    else:
-                        raise ValueError(
-                            f'Object `{obj_name}` does not have a ``param_init_fn`` or a ``reset_parameters`` function. '
-                            'This leaves parameters without initialization. Please add a ``param_init_fn`` or ``reset_parameters`` '
-                            f'to module `{obj_name}`.',
-                        )
-
-            if version.parse(torch.__version__) > version.parse('2.1.0.dev'):
-                # CustomPolicy is only supported in torch v2.1.0-rc1 or higher
-                from torch.distributed.fsdp.wrap import CustomPolicy  # type: ignore
-
-                def lambda_fn(module: torch.nn.Module) -> Union[bool, dict]:
-                    ret = False
-                    if hasattr(module, '_fsdp_wrap'):
-                        ret = bool(module._fsdp_wrap)
-                    elif hasattr(obj, 'fsdp_wrap_fn') and isinstance(obj.fsdp_wrap_fn, Callable):
-                        ret = obj.fsdp_wrap_fn(module)
-                        if isinstance(ret, dict):
-                            ret = set_custom_fsdp_module_kwargs(ret, process_group_cache)
-                    return ret
-
-                _auto_wrap_policy = CustomPolicy(lambda_fn)
-            else:
-                # Choose which modules to FSDP wrap according to the following priority:
-                # If module has attribute `module._fsdp_wrap = ...`, always respect it
-                # Otherwise wrap if root object `obj.fsdp_wrap_fn(module)` is true.
-                def __auto_wrap_policy(module: torch.nn.Module, recurse: bool, nonwrapped_numel: int) -> bool:
-                    if recurse:
-                        return True
-                    should_be_wrapped = False
-                    if hasattr(module, '_fsdp_wrap'):
-                        should_be_wrapped = bool(module._fsdp_wrap)
-                    elif hasattr(obj, 'fsdp_wrap_fn') and isinstance(obj.fsdp_wrap_fn, Callable):
-                        should_be_wrapped = obj.fsdp_wrap_fn(module)
-
-                    return should_be_wrapped
-
-                def _auto_wrap_policy_new(module: torch.nn.Module, recurse: bool, nonwrapped_numel: int) -> bool:
-                    return __auto_wrap_policy(module, recurse, nonwrapped_numel)
+            def lambda_fn(module: torch.nn.Module) -> Union[bool, dict]:
+                ret = False
+                if hasattr(module, '_fsdp_wrap'):
+                    ret = bool(module._fsdp_wrap)
+                elif hasattr(obj, 'fsdp_wrap_fn') and isinstance(obj.fsdp_wrap_fn, Callable):
+                    ret = obj.fsdp_wrap_fn(module)
+                    if isinstance(ret, dict):
+                        ret = set_custom_fsdp_module_kwargs(ret, process_group_cache)
+                return ret
 
-                _auto_wrap_policy = _auto_wrap_policy_new
+            _auto_wrap_policy = CustomPolicy(lambda_fn)
 
             fsdp_obj = FullyShardedDataParallel(
                 obj,
@@ -640,75 +536,50 @@ def _auto_wrap_policy_new(module: torch.nn.Module, recurse: bool, nonwrapped_num
                 # FP8 TE requires using the TE checkpoint function, FSDP activation checkpointing only works with TE non-reentrant checkpointing
                 if te_checkpoint_wrapper:
                     assert not activation_checkpointing_reentrant, 'TE checkpoint only works with non-reentrant checkpointing'
-                if version.parse(torch.__version__) > version.parse('2.1.0.dev'):
-                    from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import offload_wrapper
-                    if not activation_checkpointing_reentrant:
-                        if te_checkpoint_wrapper:
-                            try:
-                                import transformer_engine.pytorch as te
-                            except ModuleNotFoundError:
-                                raise ModuleNotFoundError(
-                                    'Please install transformer-engine to use TE checkpoint wrapper',
-                                )
-
-                            # RNG state tracker for checkpointing
-                            CUDA_RNG_STATES_TRACKER = te.distributed.CudaRNGStatesTracker()
-                            CUDA_RNG_STATES_TRACKER.add('fsdp-rng', te_rng_seed)
-
-                            def get_cuda_rng_tracker():
-                                return CUDA_RNG_STATES_TRACKER
-
-                            first_wrap_fn = lambda m: checkpoint_wrapper(
-                                m,
-                                context_fn=te.distributed.get_activation_recompute_contexts,
-                                checkpoint_fn=te.distributed.checkpoint,
-                                use_reentrant=False,
-                                get_rng_state_tracker=get_cuda_rng_tracker,
-                            )
-                        else:
-                            first_wrap_fn = lambda m: checkpoint_wrapper(
-                                m,
-                                checkpoint_impl=CheckpointImpl.NO_REENTRANT,
-                            ) if activation_checkpointing else (lambda module: module)
-                        second_wrap_fn = (
-                            lambda module: offload_wrapper(
-                                first_wrap_fn(module)
-                                if activation_checkpointing else module,  # type: ignore reportGeneralTypeIssues
-                            )
-                        ) if activation_cpu_offload else first_wrap_fn
-                    else:
+                if not activation_checkpointing_reentrant:
+                    if te_checkpoint_wrapper:
+                        try:
+                            import transformer_engine.pytorch as te
+                        except ModuleNotFoundError:
+                            raise ModuleNotFoundError('Please install transformer-engine to use TE checkpoint wrapper',)
+
+                        # RNG state tracker for checkpointing
+                        CUDA_RNG_STATES_TRACKER = te.distributed.CudaRNGStatesTracker()
+                        CUDA_RNG_STATES_TRACKER.add('fsdp-rng', te_rng_seed)
+
+                        def get_cuda_rng_tracker():
+                            return CUDA_RNG_STATES_TRACKER
 
                         first_wrap_fn = lambda m: checkpoint_wrapper(
                             m,
-                            checkpoint_impl=CheckpointImpl.REENTRANT,
-                        ) if activation_checkpointing else (lambda module: module)
-                        second_wrap_fn = (
-                            lambda module: offload_wrapper(
-                                first_wrap_fn(module)
-                                if activation_checkpointing else module,  # type: ignore reportGeneralTypeIssues
-                            )
-                        ) if activation_cpu_offload else first_wrap_fn
-                else:
-                    if not activation_checkpointing_reentrant:
+                            context_fn=te.distributed.get_activation_recompute_contexts,
+                            checkpoint_fn=te.distributed.checkpoint,
+                            use_reentrant=False,
+                            get_rng_state_tracker=get_cuda_rng_tracker,
+                        )
+                    else:
                         first_wrap_fn = lambda m: checkpoint_wrapper(
                             m,
                             checkpoint_impl=CheckpointImpl.NO_REENTRANT,
                         ) if activation_checkpointing else (lambda module: module)
-                        second_wrap_fn = (
-                            lambda module: checkpoint_wrapper(
-                                first_wrap_fn(module),  # type: ignore reportGeneralTypeIssues
-                                checkpoint_impl=CheckpointImpl.NO_REENTRANT,
-                                offload_to_cpu=True,
-                            )
-                        ) if activation_cpu_offload else first_wrap_fn
-                    else:
-                        first_wrap_fn = checkpoint_wrapper if activation_checkpointing else (lambda module: module)
-                        second_wrap_fn = (
-                            lambda module: checkpoint_wrapper(
-                                first_wrap_fn(module),  # type: ignore reportGeneralTypeIssues
-                                offload_to_cpu=True,
-                            )
-                        ) if activation_cpu_offload else first_wrap_fn
+                    second_wrap_fn = (
+                        lambda module: offload_wrapper(
+                            first_wrap_fn(module)
+                            if activation_checkpointing else module,  # type: ignore reportGeneralTypeIssues
+                        )
+                    ) if activation_cpu_offload else first_wrap_fn
+                else:
+
+                    first_wrap_fn = lambda m: checkpoint_wrapper(
+                        m,
+                        checkpoint_impl=CheckpointImpl.REENTRANT,
+                    ) if activation_checkpointing else (lambda module: module)
+                    second_wrap_fn = (
+                        lambda module: offload_wrapper(
+                            first_wrap_fn(module)
+                            if activation_checkpointing else module,  # type: ignore reportGeneralTypeIssues
+                        )
+                    ) if activation_cpu_offload else first_wrap_fn
 
                 # Choose which modules to activation checkpoint according to the following priority:
                 # If module has attribute `module._activation_checkpointing = ...`, always respect it
diff --git a/composer/distributed/mosaic_parallelism.py b/composer/distributed/mosaic_parallelism.py
index 66c06d911b..0fa6a0547c 100644
--- a/composer/distributed/mosaic_parallelism.py
+++ b/composer/distributed/mosaic_parallelism.py
@@ -8,7 +8,6 @@
 from typing import Any, Union
 
 import torch
-from packaging import version
 from torch import distributed
 from torch.distributed import ProcessGroup
 from torch.distributed.fsdp import (
@@ -27,12 +26,10 @@
     'NO_SHARD': ShardingStrategy.NO_SHARD,
     'SHARD_GRAD_OP': ShardingStrategy.SHARD_GRAD_OP,
     'FULL_SHARD': ShardingStrategy.FULL_SHARD,
+    '_HYBRID_SHARD_ZERO2': ShardingStrategy._HYBRID_SHARD_ZERO2,
+    'HYBRID_SHARD': ShardingStrategy.HYBRID_SHARD,
 }
 
-if version.parse(torch.__version__) >= version.parse('2.1.0'):
-    SHARDING_MAP['_HYBRID_SHARD_ZERO2'] = ShardingStrategy._HYBRID_SHARD_ZERO2
-    SHARDING_MAP['HYBRID_SHARD'] = ShardingStrategy.HYBRID_SHARD
-
 BACKWARD_PREFETCH_MAP = {
     'NONE': None,
     'BACKWARD_PRE': BackwardPrefetch.BACKWARD_PRE,
diff --git a/composer/profiler/torch_profiler.py b/composer/profiler/torch_profiler.py
index 883ba2b442..93e753bbd5 100644
--- a/composer/profiler/torch_profiler.py
+++ b/composer/profiler/torch_profiler.py
@@ -13,12 +13,12 @@
 
 import torch.cuda
 import torch.profiler
-from packaging import version
 from torch.profiler.profiler import ProfilerAction as TorchProfilerAction
 
 from composer.core.callback import Callback
 from composer.loggers import Logger
 from composer.profiler.profiler_action import ProfilerAction
+from composer.profiler.utils import export_memory_timeline_html
 from composer.utils import (
     FORMAT_NAME_WITH_DIST_AND_TIME_TABLE,
     FORMAT_NAME_WITH_DIST_TABLE,
@@ -296,44 +296,39 @@ def handler_fn(prof: torch.profiler.profiler.profile):
                 f'PyTorch memory timeline profiler enabled: {self.memory_filename if self.memory_filename else False}',
             )
             if self.memory_filename is not None:
-                if version.parse(torch.__version__) > version.parse('2.1.0.dev'):  # type: ignore
-                    # memory timeline profiling is only supported in torch v2.1.0-rc1 or higher
-                    memory_trace_file_name = os.path.join(
-                        folder_name,
-                        format_name_with_dist_and_time(
-                            self.memory_filename,
-                            run_name=state.run_name,
-                            timestamp=timestamp,
-                        ),
+                memory_trace_file_name = os.path.join(
+                    folder_name,
+                    format_name_with_dist_and_time(
+                        self.memory_filename,
+                        run_name=state.run_name,
+                        timestamp=timestamp,
+                    ),
+                )
+                log.debug(f'Saving memory trace to {memory_trace_file_name}')
+                memory_trace_file_dirname = os.path.dirname(memory_trace_file_name)
+                if memory_trace_file_dirname:
+                    os.makedirs(memory_trace_file_dirname, exist_ok=True)
+                export_memory_timeline_html(
+                    prof,
+                    memory_trace_file_name,
+                    torch.cuda.current_device(),  # type: ignore
+                )
+                log.debug(f'Uploaded memory trace to {self.memory_remote_file_name}')
+                if self.memory_remote_file_name is not None:
+                    memory_trace_remote_file_name = format_name_with_dist_and_time(
+                        self.memory_remote_file_name,
+                        run_name=state.run_name,
+                        timestamp=timestamp,
+                    )
+                    memory_trace_remote_file_name = memory_trace_remote_file_name.lstrip('/')
+                    log.debug(
+                        f'Uploading memory trace to {memory_trace_remote_file_name} from {memory_trace_file_name}',
                     )
-                    log.debug(f'Saving memory trace to {memory_trace_file_name}')
-                    memory_trace_file_dirname = os.path.dirname(memory_trace_file_name)
-                    if memory_trace_file_dirname:
-                        os.makedirs(memory_trace_file_dirname, exist_ok=True)
-                    from composer.profiler.utils import export_memory_timeline_html
-                    export_memory_timeline_html(
-                        prof,
-                        memory_trace_file_name,
-                        torch.cuda.current_device(),  # type: ignore
+                    logger.upload_file(
+                        remote_file_name=memory_trace_remote_file_name,
+                        file_path=memory_trace_file_name,
+                        overwrite=self.overwrite,
                     )
-                    log.debug(f'Uploaded memory trace to {self.memory_remote_file_name}')
-                    if self.memory_remote_file_name is not None:
-                        memory_trace_remote_file_name = format_name_with_dist_and_time(
-                            self.memory_remote_file_name,
-                            run_name=state.run_name,
-                            timestamp=timestamp,
-                        )
-                        memory_trace_remote_file_name = memory_trace_remote_file_name.lstrip('/')
-                        log.debug(
-                            f'Uploading memory trace to {memory_trace_remote_file_name} from {memory_trace_file_name}',
-                        )
-                        logger.upload_file(
-                            remote_file_name=memory_trace_remote_file_name,
-                            file_path=memory_trace_file_name,
-                            overwrite=self.overwrite,
-                        )
-                else:
-                    log.warning('Memory timeline is supported after PyTorch 2.1.0. Skipping memory trace.')
 
             if self.num_traces_to_keep >= 0:
                 while len(self.saved_traces) > self.num_traces_to_keep:
diff --git a/composer/profiler/utils.py b/composer/profiler/utils.py
index ddd235b711..68f2862549 100644
--- a/composer/profiler/utils.py
+++ b/composer/profiler/utils.py
@@ -13,7 +13,7 @@
 import numpy as np
 import torch
 import torch.cuda
-from packaging import version
+from torch.profiler._memory_profiler import _CATEGORY_TO_COLORS, _CATEGORY_TO_INDEX, MemoryProfileTimeline
 from torch.profiler.profiler import profile as TorchProfile
 
 log = logging.getLogger(__name__)
@@ -29,12 +29,6 @@ def export_memory_timeline_html(
     return_fig: bool = False,
 ) -> Optional[Union[None, Any]]:
     """Exports a memory timeline to an HTML file. Similar to the PyTorch plotting function, but with adjusted axis tickers and grids."""
-    if version.parse(torch.__version__) <= version.parse('2.1.0.dev'):
-        log.warning('export_memory_timeline_html failed because memory timeline is supported after PyTorch 2.1.0.')
-        return
-
-    from torch.profiler._memory_profiler import _CATEGORY_TO_COLORS, _CATEGORY_TO_INDEX, MemoryProfileTimeline
-
     # Default to device 0, if unset. Fallback on cpu.
     if device is None and prof.use_device and prof.use_device != 'cuda':
         device = prof.use_device + ':0'
diff --git a/composer/trainer/_patch_pytorch.py b/composer/trainer/_patch_pytorch.py
index 2c27118090..881914e2ce 100644
--- a/composer/trainer/_patch_pytorch.py
+++ b/composer/trainer/_patch_pytorch.py
@@ -11,7 +11,6 @@
 """PyTorch, especially PyTorch Distributed, monkeypatches."""
 
 import logging
-import math
 import functools
 import contextlib
 from dataclasses import asdict
@@ -20,16 +19,9 @@
 
 
 import torch
-import torch.distributed._shard.sharded_tensor.metadata as sharded_tensor_meta
-from torch.distributed._shard.sharding_spec import ChunkShardingSpec
 import torch.nn as nn
-import torch.nn.functional as F
 from packaging import version
-from torch.distributed._shard.sharding_spec import ShardMetadata
-from torch.distributed._shard.sharding_spec._internals import get_chunked_dim_size, get_split_size
 from torch.distributed.fsdp import FullyShardedDataParallel, ShardingStrategy
-from torch.distributed.fsdp._fsdp_extensions import _ext_pre_load_state_dict_transform
-from torch.distributed.utils import _replace_by_prefix
 
 from composer.utils import dist
 
@@ -47,29 +39,7 @@ def patch_unshard_for_automicrobatching(auto_microbatch_size_found=False):
 
 def patch_pytorch():
     """Monkey patches pytorch functions based on pytorch version."""
-    if version.parse(torch.__version__) < version.parse('2.1.1'):
-        # Monkey patch for torch < 2.1.1 ie torch == 2.1.0
-
-        # Monkey patch sharding method
-        ChunkShardingSpec.build_metadata = build_metadata
-
-        # Monkey patch partial state dict handling
-        from torch.distributed.fsdp import _state_dict_utils
-
-        _state_dict_utils._sharded_pre_load_state_dict_hook = (_sharded_pre_load_state_dict_hook)
-
-        # Allow 2D HSDP
-        from torch.distributed.fsdp import _runtime_utils
-        _runtime_utils._validate_and_get_hybrid_shard_state = lambda *args, **kwargs: None
-
-    elif version.parse(torch.__version__) < version.parse('2.1.3'):
-        # Monkey patch for torch < 2.1.3 ie torch == 2.1.1, 2.1.2
-
-        # Allow 2D HSDP
-        from torch.distributed.fsdp import _runtime_utils
-        _runtime_utils._validate_and_get_hybrid_shard_state = lambda *args, **kwargs: None
-
-    elif version.parse(torch.__version__) < version.parse('2.2.1'):
+    if version.parse(torch.__version__) < version.parse('2.2.1'):
         # Monkey patch for torch < 2.2.1 ie torch == 2.2.0
 
         # Allow 2D HSDP
@@ -133,139 +103,11 @@ def patch_pytorch():
         _MeshEnv.create_child_mesh = create_child_mesh
         DeviceMesh.__getitem__ = device_mesh__getitem__
 
+    elif version.parse(torch.__version__) < version.parse('2.4.1'):
+        # Monkey patch for torch < 2.4.1 ie torch == 2.4.0
 
-def build_metadata(
-    self,
-    tensor_sizes: torch.Size,
-    tensor_properties: sharded_tensor_meta.TensorProperties,
-) -> sharded_tensor_meta.ShardedTensorMetadata:
-    """Adds nightly change for ChunkShardingSpec.
-
-    Change implemented in https://github.com/pytorch/pytorch/pull/108915
-    """
-    tensor_num_dim = len(tensor_sizes)
-
-    self._verify_dim(self.dim)
-    if self.dim >= tensor_num_dim or self.dim < -tensor_num_dim:  # type: ignore[operator]
-        raise ValueError(f'Invalid sharding dim: {self.dim}')
-
-    shards_metadata = []
-    sharding_dim_size = tensor_sizes[self.dim]  # type: ignore[index]
-    chunks = len(self.placements)
-    split_size = get_split_size(sharding_dim_size, chunks)
-    for idx, placement in enumerate(self.placements):
-        # generate ShardMetadata for each placement device
-        chunked_dim_size = get_chunked_dim_size(sharding_dim_size, split_size, idx)
-        shard_size = list(tensor_sizes)
-        current_offsets = [0] * tensor_num_dim
-        current_offsets[self.dim] = split_size * idx  # type: ignore[index]
-        shard_size[self.dim] = chunked_dim_size  # type: ignore[index]
-
-        shard_metadata = ShardMetadata(
-            shard_offsets=current_offsets,
-            shard_sizes=shard_size,
-            placement=placement,
-        )
-        shards_metadata.append(shard_metadata)
-
-    return sharded_tensor_meta.ShardedTensorMetadata(shards_metadata, tensor_sizes, tensor_properties)
-
-
-@no_type_check
-def _sharded_pre_load_state_dict_hook(
-    module: nn.Module,
-    fsdp_state,
-    state_dict: dict[str, Any],
-    prefix: str,
-) -> None:
-    """Adds nightly change for partial state dict error handling.
-
-    https://github.com/pytorch/pytorch/blob/0511df0ee9edeb5c2613805ccfb49beb323b87f9/torch/distributed/fsdp/_state_dict_utils.py#L607-L615
-
-    The hook combines the unflattened, sharded parameters (ShardedTensor) to
-    a new FlatParameter and shards the new FlatParameter to the local chunk.
-    """
-    from torch.distributed._tensor import Replicate
-    from torch.distributed.distributed_c10d import _get_pg_default_device
-    from torch.distributed.fsdp._common_utils import FSDP_PREFIX, _has_fsdp_params, _is_composable, _module_handle
-    from torch.distributed.fsdp._runtime_utils import _lazy_init
-    from torch.distributed.fsdp._state_dict_utils import _enter_unshard_params_ctx, _param_name_infos
-
-    _lazy_init(fsdp_state, module)
-    if not _is_composable(fsdp_state):
-        _replace_by_prefix(state_dict, prefix, prefix + f'{FSDP_PREFIX}')
-    if not _has_fsdp_params(fsdp_state, module):
-        return
-
-    handle = _module_handle(fsdp_state, module)
-    if not handle.uses_sharded_strategy:  # type: ignore
-        raise RuntimeError(
-            'load_sharded_state_dict can only be called when parameters '
-            'are flattened and sharded.',
-        )
-
-    device = fsdp_state.compute_device
-    for fqn, _, _ in _param_name_infos(module, fsdp_state):
-        if not _is_composable(fsdp_state):
-            fqn_from_global_root = f'{prefix}{FSDP_PREFIX}{fqn}'
-        else:
-            fqn_from_global_root = f'{prefix}{fqn}'
-        try:
-            param = state_dict.pop(fqn_from_global_root)
-        except KeyError:
-            log.warning(
-                f'Did not find param with FQN {fqn_from_global_root}, skipping it. '  # noqa: G004
-                'The weight will not be filled if you expect it to be.',
-            )
-            continue  # TODO: Improve unittesting for state_dict finetuning
-            # cases: https://github.com/pytorch/pytorch/issues/109134
-
-        if not fsdp_state._state_dict_config.use_dtensor:
-            # All-gather the param (ShardedTensor)
-            param, shards = _ext_pre_load_state_dict_transform(param)
-
-            assert len(shards) < 2, (
-                'Expects 0 or 1 shard per rank '
-                f'but got {len(shards)} shards on rank {fsdp_state.rank}.'
-            )
-            param_numel = param.size().numel()
-            dim_0_size = param.size()[0]
-            chunk_size = (math.ceil(dim_0_size / fsdp_state.world_size) * param_numel // dim_0_size)
-            if len(shards) == 1:
-                local_tensor = shards[0].tensor.flatten()
-                pg_device = _get_pg_default_device(fsdp_state.process_group)
-                if local_tensor.device.type != pg_device.type:
-                    local_tensor = local_tensor.to(pg_device)
-                num_padding = chunk_size - local_tensor.numel()
-                if num_padding > 0:
-                    local_tensor = F.pad(local_tensor, [0, num_padding])
-            else:
-                local_tensor = torch.zeros(chunk_size, dtype=param.dtype, device=device)
-            tensor = torch.empty(
-                chunk_size * fsdp_state.world_size,
-                dtype=local_tensor.dtype,
-                device=device,
-            )
-            if local_tensor.is_cpu:
-                # Tensor could be on FSDP GPU compute device, while local_tensor is on CPU.
-                # Convert to CPU so all_gather can work.
-                tensor_dev = tensor.device
-                tensor = tensor.cpu()
-                tensor_list = list(torch.chunk(tensor, torch.distributed.get_world_size(fsdp_state.process_group)))
-                torch.distributed.all_gather(tensor_list, local_tensor, group=fsdp_state.process_group)
-                tensor.to(tensor_dev)
-            else:
-                torch.distributed.all_gather_into_tensor(tensor, local_tensor, group=fsdp_state.process_group)
-            tensor = tensor.narrow(0, 0, param_numel).reshape(param.size())
-            state_dict[fqn_from_global_root] = tensor
-        else:
-            if param.device != fsdp_state._device_mesh.device_type:  # type: ignore
-                param = param.to(fsdp_state._device_mesh.device_type)  # type: ignore
-
-            param = param.redistribute(device_mesh=param.device_mesh, placements=[Replicate()])
-            state_dict[fqn_from_global_root] = param.to_local()
-
-    _enter_unshard_params_ctx(module, fsdp_state, writeback=True)
+        # No monkeypatches are needed for torch 2.4.0.
+        pass
 
 
 if version.parse(torch.__version__) >= version.parse('2.2.1') and version.parse(
diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py
index b2f829ca10..27323718fc 100644
--- a/composer/trainer/trainer.py
+++ b/composer/trainer/trainer.py
@@ -41,7 +41,6 @@
 import torch.utils.data
 from packaging import version
 from torch._dynamo import OptimizedModule
-from torch.cuda.amp.grad_scaler import GradScaler
 from torch.distributed.fsdp import FullyShardedDataParallel
 from torch.distributed.fsdp._runtime_utils import _post_backward_final_callback
 from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler
diff --git a/composer/utils/checkpoint.py b/composer/utils/checkpoint.py
index 56b13fcac6..648290a320 100644
--- a/composer/utils/checkpoint.py
+++ b/composer/utils/checkpoint.py
@@ -692,14 +692,19 @@ def load_sharded_checkpoint(
                 # Ensure state exists
                 state_dict['state'] = state_dict.get('state', {})
 
-            # dist_cp.load breaks unless the specified state_dict supports `load_state_dict`
-            # See: https://github.com/pytorch/pytorch/issues/125096
-            dist_cp.load_state_dict(
-                state_dict=state_dict,
-                storage_reader=storage_reader,
-                planner=state.fsdp_config.load_planner,
-                no_dist=(not dist.is_initialized()),
-            )
+            if version.parse(torch.__version__) >= version.parse('2.4.0'):
+                dist_cp.load(
+                    state_dict=state_dict,
+                    storage_reader=storage_reader,
+                    planner=state.fsdp_config.load_planner,
+                )
+            else:
+                dist_cp.load_state_dict(
+                    state_dict=state_dict,
+                    storage_reader=storage_reader,
+                    planner=state.fsdp_config.load_planner,
+                    no_dist=(not dist.is_initialized()),
+                )
 
             log.info(f'Loaded state dict')
             state.load_state_dict(
@@ -1159,9 +1164,15 @@ def _save_checkpoint(
             if version.parse(torch.__version__) >= version.parse('2.3.0'):
                 save_planner = state.fsdp_config.save_planner
                 if save_planner is None:
-                    from composer.trainer._patch_pytorch import SavePlannerWithDedupFix
+                    if version.parse(torch.__version__) < version.parse('2.4.0'):
+                        # Save-time dedup is only broken on torch < 2.4, so only those versions need the patched planner
+                        from composer.trainer._patch_pytorch import SavePlannerWithDedupFix
+
+                        save_planner = SavePlannerWithDedupFix()
+                    else:
+                        from torch.distributed.checkpoint.default_planner import DefaultSavePlanner
 
-                    save_planner = SavePlannerWithDedupFix()
+                        save_planner = DefaultSavePlanner(dedup_save_to_lowest_rank=True)
                 dist_cp.save(
                     state_dict=state_dict,
                     storage_writer=dist_cp.FileSystemWriter(dirname),
diff --git a/composer/utils/dist.py b/composer/utils/dist.py
index 2178ce2dd5..0515828a10 100644
--- a/composer/utils/dist.py
+++ b/composer/utils/dist.py
@@ -47,12 +47,8 @@
 import torch
 import torch.distributed as dist
 import torch.utils.data
-from packaging import version
 
-from composer.utils.device import get_device, is_hpu_installed, is_xla_installed
-
-if is_xla_installed():
-    import torch_xla
+from composer.utils.device import get_device, is_hpu_installed
 
 if TYPE_CHECKING:
     from composer.devices import Device
@@ -579,8 +575,6 @@ def initialize_dist(device: Union[str, Device], timeout: float = 300.0) -> None:
                 'PyTorch XLA package not found. In order to use XLA based devices '
                 'PyTorch XLA must be installed.',
             )
-        if version.parse(torch_xla.__version__) < version.parse('2.1.0'):
-            raise RuntimeError(f'PyTorch XLA version must be at least 2.1.0, found {torch_xla.__version__}.')
         # XLA initialization requires the init_method to be set
         dist.init_process_group(device_obj.dist_backend, init_method='xla://')
     elif dist_env_vars_match_defaults:
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 80ae8bad2e..c3f4dee907 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -188,9 +188,10 @@ ENV PYTORCH_NIGHTLY_URL=${PYTORCH_NIGHTLY_URL}
 ENV PYTORCH_NIGHTLY_VERSION=${PYTORCH_NIGHTLY_VERSION}
 
 RUN if [ -z "$PYTORCH_NIGHTLY_URL" ] ; then \
-      CUDA_VERSION_TAG=$(python${PYTHON_VERSION} -c "print('cu' + ''.join('${CUDA_VERSION}'.split('.')[:2]) if '${CUDA_VERSION}' else 'cpu')") && \
-        pip${PYTHON_VERSION} install --no-cache-dir --find-links https://download.pytorch.org/whl/torch_stable.html \
-            torch==${PYTORCH_VERSION}+${CUDA_VERSION_TAG} \
+        CUDA_VERSION_TAG=$(python${PYTHON_VERSION} -c "print('cu' + ''.join('${CUDA_VERSION}'.split('.')[:2]) if '${CUDA_VERSION}' else 'cpu')") && \
+        pip${PYTHON_VERSION} install --no-cache-dir --find-links https://download.pytorch.org/whl/torch/ \
+            torch==${PYTORCH_VERSION}+${CUDA_VERSION_TAG} && \
+        pip${PYTHON_VERSION} install --no-cache-dir --find-links https://download.pytorch.org/whl/torchvision/ \
             torchvision==${TORCHVISION_VERSION}+${CUDA_VERSION_TAG} ; \
     else \
         pip${PYTHON_VERSION} install --no-cache-dir --pre --index-url ${PYTORCH_NIGHTLY_URL} \
@@ -261,7 +262,7 @@ RUN if [ -n "$MOFED_VERSION" ] ; then \
 RUN if [ -n "$CUDA_VERSION" ] ; then \
         pip${PYTHON_VERSION} install --upgrade --no-cache-dir ninja==1.11.1 && \
         pip${PYTHON_VERSION} install --upgrade --no-cache-dir --force-reinstall packaging==22.0 && \
-        MAX_JOBS=1 pip${PYTHON_VERSION} install --no-cache-dir --no-build-isolation flash-attn==2.6.2; \
+        MAX_JOBS=1 pip${PYTHON_VERSION} install --no-cache-dir --no-build-isolation flash-attn==2.6.3; \
         cd .. ; \
     fi
 
diff --git a/docker/README.md b/docker/README.md
index a0514ecb3d..09dd2591f5 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -30,15 +30,15 @@ To install composer, once inside the image, run `pip install mosaicml`.
 <!-- BEGIN_PYTORCH_BUILD_MATRIX -->
 | Linux Distro   | Flavor   | PyTorch Version   | CUDA Version        | Python Version   | Docker Tags                                                                              |
 |----------------|----------|-------------------|---------------------|------------------|------------------------------------------------------------------------------------------|
-| Ubuntu 20.04   | Base     | 2.3.1             | 12.1.1 (Infiniband) | 3.11             | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04`         |
-| Ubuntu 20.04   | Base     | 2.3.1             | 12.1.1 (EFA)        | 3.11             | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws` |
-| Ubuntu 20.04   | Base     | 2.3.1             | cpu                 | 3.11             | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04`       |
+| Ubuntu 20.04   | Base     | 2.4.0             | 12.4.1 (Infiniband) | 3.11             | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04`         |
+| Ubuntu 20.04   | Base     | 2.4.0             | 12.4.1 (EFA)        | 3.11             | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws` |
+| Ubuntu 20.04   | Base     | 2.4.0             | cpu                 | 3.11             | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04`       |
+| Ubuntu 20.04   | Base     | 2.3.1             | 12.1.1 (Infiniband) | 3.11             | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04`                                    |
+| Ubuntu 20.04   | Base     | 2.3.1             | 12.1.1 (EFA)        | 3.11             | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws`                                |
+| Ubuntu 20.04   | Base     | 2.3.1             | cpu                 | 3.11             | `mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04`                                      |
 | Ubuntu 20.04   | Base     | 2.2.2             | 12.1.1 (Infiniband) | 3.11             | `mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04`                                    |
 | Ubuntu 20.04   | Base     | 2.2.2             | 12.1.1 (EFA)        | 3.11             | `mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04-aws`                                |
 | Ubuntu 20.04   | Base     | 2.2.2             | cpu                 | 3.11             | `mosaicml/pytorch:2.2.2_cpu-python3.11-ubuntu20.04`                                      |
-| Ubuntu 20.04   | Base     | 2.1.2             | 12.1.1 (Infiniband) | 3.10             | `mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04`                                    |
-| Ubuntu 20.04   | Base     | 2.1.2             | 12.1.1 (EFA)        | 3.10             | `mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04-aws`                                |
-| Ubuntu 20.04   | Base     | 2.1.2             | cpu                 | 3.10             | `mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04`                                      |
 <!-- END_PYTORCH_BUILD_MATRIX -->
 
 **Note**: The `mosaicml/pytorch:latest`, `mosaicml/pytorch:latest_cpu`, and `mosaicml/pytorch:latest-aws`
diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml
index ee74d12309..2fb084a78b 100644
--- a/docker/build_matrix.yaml
+++ b/docker/build_matrix.yaml
@@ -1,79 +1,53 @@
 # This file is automatically generated by generate_build_matrix.py. DO NOT EDIT!
 - AWS_OFI_NCCL_VERSION: ''
-  BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
-  CUDA_VERSION: 12.1.1
-  IMAGE_NAME: torch-2-3-1-cu121
+  BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04
+  CUDA_VERSION: 12.4.1
+  IMAGE_NAME: torch-2-4-0-cu124
   MOFED_VERSION: latest-23.10
-  NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471
-    brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471
-    brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471
-    brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471
-    brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511
-    brand=nvidiartx,driver>=510,driver<511 brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511
-    brand=quadro,driver>=510,driver<511 brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511
-    brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516
-    brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=515,driver<516 brand=geforce,driver>=515,driver<516
-    brand=geforcertx,driver>=515,driver<516 brand=quadro,driver>=515,driver<516 brand=quadrortx,driver>=515,driver<516
-    brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=515,driver<516 brand=tesla,driver>=525,driver<526
-    brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526
-    brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526
-    brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526
+  NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
   PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.3.1
+  PYTORCH_VERSION: 2.4.0
   TAGS:
-  - mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04
+  - mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04
   - mosaicml/pytorch:latest
   TARGET: pytorch_stage
-  TORCHVISION_VERSION: 0.18.1
+  TORCHVISION_VERSION: 0.19.0
 - AWS_OFI_NCCL_VERSION: v1.9.1-aws
-  BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
-  CUDA_VERSION: 12.1.1
-  IMAGE_NAME: torch-2-3-1-cu121-aws
+  BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04
+  CUDA_VERSION: 12.4.1
+  IMAGE_NAME: torch-2-4-0-cu124-aws
   MOFED_VERSION: ''
-  NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471
-    brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471
-    brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471
-    brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471
-    brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511
-    brand=nvidiartx,driver>=510,driver<511 brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511
-    brand=quadro,driver>=510,driver<511 brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511
-    brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516
-    brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=515,driver<516 brand=geforce,driver>=515,driver<516
-    brand=geforcertx,driver>=515,driver<516 brand=quadro,driver>=515,driver<516 brand=quadrortx,driver>=515,driver<516
-    brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=515,driver<516 brand=tesla,driver>=525,driver<526
-    brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526
-    brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526
-    brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526
+  NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
   PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.3.1
+  PYTORCH_VERSION: 2.4.0
   TAGS:
-  - mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws
+  - mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws
   - mosaicml/pytorch:latest-aws
   TARGET: pytorch_stage
-  TORCHVISION_VERSION: 0.18.1
+  TORCHVISION_VERSION: 0.19.0
 - AWS_OFI_NCCL_VERSION: ''
   BASE_IMAGE: ubuntu:20.04
   CUDA_VERSION: ''
-  IMAGE_NAME: torch-2-3-1-cpu
+  IMAGE_NAME: torch-2-4-0-cpu
   MOFED_VERSION: ''
   NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
   PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.3.1
+  PYTORCH_VERSION: 2.4.0
   TAGS:
-  - mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
+  - mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
   - mosaicml/pytorch:latest_cpu
   TARGET: pytorch_stage
-  TORCHVISION_VERSION: 0.18.1
+  TORCHVISION_VERSION: 0.19.0
 - AWS_OFI_NCCL_VERSION: ''
   BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
   CUDA_VERSION: 12.1.1
-  IMAGE_NAME: torch-2-2-2-cu121
+  IMAGE_NAME: torch-2-3-1-cu121
   MOFED_VERSION: latest-23.10
   NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471
     brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471
@@ -92,15 +66,15 @@
   PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.2.2
+  PYTORCH_VERSION: 2.3.1
   TAGS:
-  - mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04
+  - mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04
   TARGET: pytorch_stage
-  TORCHVISION_VERSION: 0.17.2
+  TORCHVISION_VERSION: 0.18.1
 - AWS_OFI_NCCL_VERSION: v1.9.1-aws
   BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
   CUDA_VERSION: 12.1.1
-  IMAGE_NAME: torch-2-2-2-cu121-aws
+  IMAGE_NAME: torch-2-3-1-cu121-aws
   MOFED_VERSION: ''
   NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471
     brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471
@@ -119,29 +93,29 @@
   PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.2.2
+  PYTORCH_VERSION: 2.3.1
   TAGS:
-  - mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04-aws
+  - mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws
   TARGET: pytorch_stage
-  TORCHVISION_VERSION: 0.17.2
+  TORCHVISION_VERSION: 0.18.1
 - AWS_OFI_NCCL_VERSION: ''
   BASE_IMAGE: ubuntu:20.04
   CUDA_VERSION: ''
-  IMAGE_NAME: torch-2-2-2-cpu
+  IMAGE_NAME: torch-2-3-1-cpu
   MOFED_VERSION: ''
   NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
   PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.2.2
+  PYTORCH_VERSION: 2.3.1
   TAGS:
-  - mosaicml/pytorch:2.2.2_cpu-python3.11-ubuntu20.04
+  - mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
   TARGET: pytorch_stage
-  TORCHVISION_VERSION: 0.17.2
+  TORCHVISION_VERSION: 0.18.1
 - AWS_OFI_NCCL_VERSION: ''
   BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
   CUDA_VERSION: 12.1.1
-  IMAGE_NAME: torch-2-1-2-cu121
+  IMAGE_NAME: torch-2-2-2-cu121
   MOFED_VERSION: latest-23.10
   NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471
     brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471
@@ -157,18 +131,18 @@
     brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526
     brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526
     brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526
-  PYTHON_VERSION: '3.10'
+  PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.1.2
+  PYTORCH_VERSION: 2.2.2
   TAGS:
-  - mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04
+  - mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04
   TARGET: pytorch_stage
-  TORCHVISION_VERSION: 0.16.2
+  TORCHVISION_VERSION: 0.17.2
 - AWS_OFI_NCCL_VERSION: v1.9.1-aws
   BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
   CUDA_VERSION: 12.1.1
-  IMAGE_NAME: torch-2-1-2-cu121-aws
+  IMAGE_NAME: torch-2-2-2-cu121-aws
   MOFED_VERSION: ''
   NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471
     brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471
@@ -184,57 +158,44 @@
     brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526
     brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526
     brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526
-  PYTHON_VERSION: '3.10'
+  PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.1.2
+  PYTORCH_VERSION: 2.2.2
   TAGS:
-  - mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04-aws
+  - mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04-aws
   TARGET: pytorch_stage
-  TORCHVISION_VERSION: 0.16.2
+  TORCHVISION_VERSION: 0.17.2
 - AWS_OFI_NCCL_VERSION: ''
   BASE_IMAGE: ubuntu:20.04
   CUDA_VERSION: ''
-  IMAGE_NAME: torch-2-1-2-cpu
+  IMAGE_NAME: torch-2-2-2-cpu
   MOFED_VERSION: ''
   NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
-  PYTHON_VERSION: '3.10'
+  PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.1.2
+  PYTORCH_VERSION: 2.2.2
   TAGS:
-  - mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
+  - mosaicml/pytorch:2.2.2_cpu-python3.11-ubuntu20.04
   TARGET: pytorch_stage
-  TORCHVISION_VERSION: 0.16.2
+  TORCHVISION_VERSION: 0.17.2
 - AWS_OFI_NCCL_VERSION: ''
-  BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
+  BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04
   COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.5
-  CUDA_VERSION: 12.1.1
+  CUDA_VERSION: 12.4.1
   IMAGE_NAME: composer-0-23-5
   MOFED_VERSION: latest-23.10
-  NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471
-    brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471
-    brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471
-    brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471
-    brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511
-    brand=nvidiartx,driver>=510,driver<511 brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511
-    brand=quadro,driver>=510,driver<511 brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511
-    brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516
-    brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=515,driver<516 brand=geforce,driver>=515,driver<516
-    brand=geforcertx,driver>=515,driver<516 brand=quadro,driver>=515,driver<516 brand=quadrortx,driver>=515,driver<516
-    brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=515,driver<516 brand=tesla,driver>=525,driver<526
-    brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526
-    brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526
-    brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526
+  NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
   PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.3.1
+  PYTORCH_VERSION: 2.4.0
   TAGS:
   - mosaicml/composer:0.23.5
   - mosaicml/composer:latest
   TARGET: composer_stage
-  TORCHVISION_VERSION: 0.18.1
+  TORCHVISION_VERSION: 0.19.0
 - AWS_OFI_NCCL_VERSION: ''
   BASE_IMAGE: ubuntu:20.04
   COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.5
@@ -245,9 +206,9 @@
   PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.3.1
+  PYTORCH_VERSION: 2.4.0
   TAGS:
   - mosaicml/composer:0.23.5_cpu
   - mosaicml/composer:latest_cpu
   TARGET: composer_stage
-  TORCHVISION_VERSION: 0.18.1
+  TORCHVISION_VERSION: 0.19.0
diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py
index 74d9c7fed4..a1cf5bca3b 100644
--- a/docker/generate_build_matrix.py
+++ b/docker/generate_build_matrix.py
@@ -19,22 +19,24 @@
 import yaml
 
 PRODUCTION_PYTHON_VERSION = '3.11'
-PRODUCTION_PYTORCH_VERSION = '2.3.1'
+PRODUCTION_PYTORCH_VERSION = '2.4.0'
 
 
 def _get_torchvision_version(pytorch_version: str):
+    if pytorch_version == '2.4.0':
+        return '0.19.0'
     if pytorch_version == '2.3.1':
         return '0.18.1'
     if pytorch_version == '2.2.2':
         return '0.17.2'
-    if pytorch_version == '2.1.2':
-        return '0.16.2'
     raise ValueError(f'Invalid pytorch_version: {pytorch_version}')
 
 
 def _get_base_image(cuda_version: str):
     if not cuda_version:
         return 'ubuntu:20.04'
+    if cuda_version == '12.4.1':
+        return f'nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04'
     return f'nvidia/cuda:{cuda_version}-cudnn8-devel-ubuntu20.04'
 
 
@@ -42,12 +44,12 @@ def _get_cuda_version(pytorch_version: str, use_cuda: bool):
     # From https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/
     if not use_cuda:
         return ''
+    if pytorch_version == '2.4.0':
+        return '12.4.1'
     if pytorch_version == '2.3.1':
         return '12.1.1'
     if pytorch_version == '2.2.2':
         return '12.1.1'
-    if pytorch_version == '2.1.2':
-        return '12.1.1'
     raise ValueError(f'Invalid pytorch_version: {pytorch_version}')
 
 
@@ -167,7 +169,7 @@ def _write_table(table_tag: str, table_contents: str):
 
 
 def _main():
-    python_pytorch_versions = [('3.11', '2.3.1'), ('3.11', '2.2.2'), ('3.10', '2.1.2')]
+    python_pytorch_versions = [('3.11', '2.4.0'), ('3.11', '2.3.1'), ('3.11', '2.2.2')]
     cuda_options = [True, False]
     stages = ['pytorch_stage']
     interconnects = ['mellanox', 'EFA']  # mellanox is default, EFA needed for AWS
diff --git a/setup.py b/setup.py
index 11c82b5a37..4dbf584c32 100644
--- a/setup.py
+++ b/setup.py
@@ -80,8 +80,8 @@ def package_files(prefix: str, directory: str, extension: str):
     'tqdm>=4.62.3,<5',
     'torchmetrics>=1.4.0.post0,<1.4.1',
     'torch_optimizer>=0.3.0,<0.4',
-    'torchvision>=0.13.1,<0.18.2',
-    'torch>=2.1.2,<2.3.2',
+    'torchvision>=0.14.0,<0.19.1',
+    'torch>=2.2.0,<2.4.1',
     'requests>=2.26.0,<3',
     'numpy>=1.21.5,<2.1.0',
     'psutil>=5.8.0,<7',
diff --git a/tests/trainer/test_fsdp_checkpoint.py b/tests/trainer/test_fsdp_checkpoint.py
index a59e60172a..5bdf76ce8a 100644
--- a/tests/trainer/test_fsdp_checkpoint.py
+++ b/tests/trainer/test_fsdp_checkpoint.py
@@ -315,12 +315,10 @@ def test_fsdp_full_state_dict_load(
     use_tp: bool,
     use_hsdp: bool,
 ):
-    if use_hsdp:
-        pytest.xfail('Known PyTorch issue with HSDP, waiting for pytorch patch')
+    if use_hsdp and version.parse(torch.__version__) < version.parse('2.4.0'):
+        pytest.xfail('HSDP requires torch 2.4.0 or later')
     if use_tp:
         pytest.skip('TP on PyTorch 2.3 has full state dict issues.')
-    if (use_tp or use_hsdp) and version.parse(torch.__version__) < version.parse('2.3.0'):
-        pytest.skip('HSDP and TP require torch 2.3.0 or later')
     if autoresume:
         run_name = 'my-cool-autoresume-run'
     else:
@@ -1153,7 +1151,10 @@ def set_up_planner(
             # suffix all keys with `foo_``
             state_dict['state']['model'] = {k + '_foo': v for k, v in state_dict['state']['model'].items()}
 
-            super().set_up_planner(state_dict, is_coordinator)
+            super().set_up_planner(
+                state_dict=state_dict,
+                is_coordinator=is_coordinator,
+            )
 
     class RenameLoadPlanner(DefaultLoadPlanner):
 
@@ -1164,7 +1165,11 @@ def set_up_planner(
             is_coordinator: bool,
         ) -> None:
             if 'state' not in state_dict:
-                super().set_up_planner(state_dict, metadata, is_coordinator)
+                super().set_up_planner(
+                    state_dict=state_dict,
+                    metadata=metadata,
+                    is_coordinator=is_coordinator,
+                )
                 return
 
             self.original_state_dict = state_dict
diff --git a/tests/utils/test_inference.py b/tests/utils/test_inference.py
index e7c374377d..69b78ead4c 100644
--- a/tests/utils/test_inference.py
+++ b/tests/utils/test_inference.py
@@ -196,7 +196,7 @@ def test_huggingface_export_for_inference_onnx(onnx_opset_version, tiny_bert_con
         ort_session = ort.InferenceSession(save_path, providers=['CPUExecutionProvider'])
 
         for key, value in sample_input.items():
-            sample_input[key] = cpu_device.tensor_to_device(value).numpy()
+            sample_input[key] = cpu_device.tensor_to_device(value).numpy()  # type: ignore
 
         loaded_model_out = ort_session.run(None, sample_input)
 

From 6664382d9a2f776ab887f139000a5491d5ec5785 Mon Sep 17 00:00:00 2001
From: Eitan Turok <150733043+eitanturok@users.noreply.github.com>
Date: Mon, 12 Aug 2024 18:29:41 -0400
Subject: [PATCH 12/12] Use python 3.11 in GAs (#3529)

---
 .github/workflows/daily.yaml  | 2 +-
 .github/workflows/pr-gpu.yaml | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml
index ee94e89c2b..5552d6c19c 100644
--- a/.github/workflows/daily.yaml
+++ b/.github/workflows/daily.yaml
@@ -139,7 +139,7 @@ jobs:
       pip_deps: "[all]"
       pytest-command: ${{ matrix.pytest_command }}
       pytest-markers: ${{ matrix.markers }}
-      python-version: 3.9
+      python-version: 3.11
       gpu_num: ${{ matrix.gpu_num }}
       gha-timeout: 5400
     secrets:
diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml
index 392a2665c8..2f335a5a68 100644
--- a/.github/workflows/pr-gpu.yaml
+++ b/.github/workflows/pr-gpu.yaml
@@ -29,7 +29,7 @@ jobs:
       pip_deps: "[all]"
       pytest-command: ${{ matrix.pytest_command }}
       pytest-markers: ${{ matrix.markers }}
-      python-version: 3.9
+      python-version: 3.11
       gpu_num: 1
     secrets:
       mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }}
@@ -55,7 +55,7 @@ jobs:
       pip_deps: "[all]"
       pytest-command: ${{ matrix.pytest_command }}
       pytest-markers: ${{ matrix.markers }}
-      python-version: 3.9
+      python-version: 3.11
       gpu_num: 2
     secrets:
       mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }}
@@ -82,7 +82,7 @@ jobs:
       pip_deps: "[all]"
       pytest-command: ${{ matrix.pytest_command }}
       pytest-markers: ${{ matrix.markers }}
-      python-version: 3.9
+      python-version: 3.11
       gpu_num: 4
     secrets:
       mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }}