From 454faa8e13529b4688fddfd72ae4d82c64b82e99 Mon Sep 17 00:00:00 2001
From: Mihir Patel <mihir.v.patel7@gmail.com>
Date: Thu, 7 Dec 2023 17:18:01 -0500
Subject: [PATCH 1/3] Disable mosaicml logger in foundry CI/CD (#788)

* logs

* change to env var

* lint

* lint
---
 .github/mcp/mcp_pytest.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/.github/mcp/mcp_pytest.py b/.github/mcp/mcp_pytest.py
index b6d74880c8..7571200d8d 100644
--- a/.github/mcp/mcp_pytest.py
+++ b/.github/mcp/mcp_pytest.py
@@ -114,6 +114,16 @@
         integrations=[git_integration],
         command=command,
         scheduling={'max_duration': args.timeout / 60 / 60},
+        env_variables=[
+            {
+                'key': 'MOSAICML_PLATFORM',
+                'value': 'False',
+            },
+            {
+                'key': 'PYTHONUNBUFFERED',
+                'value': '1',
+            },
+        ],
     )
 
     # Create run

From ef60e8eeeae17e158bce96e85e07345e4e647280 Mon Sep 17 00:00:00 2001
From: Prithviraj Ammanabrolu <rajammanabrolu@gmail.com>
Date: Thu, 7 Dec 2023 17:08:28 -0800
Subject: [PATCH 2/3] tiktoken chat formatting (#784)

---
 llmfoundry/tokenizers/tiktoken.py | 28 +++++++--
 tests/tokenizers/test_tiktoken.py | 95 +++++++++++++++++++++++++------
 2 files changed, 102 insertions(+), 21 deletions(-)

diff --git a/llmfoundry/tokenizers/tiktoken.py b/llmfoundry/tokenizers/tiktoken.py
index 6110f565df..fa0e148ff3 100644
--- a/llmfoundry/tokenizers/tiktoken.py
+++ b/llmfoundry/tokenizers/tiktoken.py
@@ -173,12 +173,30 @@ def default_chat_template(self):
         Pinning default Chat ML template in case defaults change.
         """
         template = (
-            "{% set system_message = '' %}"
-            '{% if USE_DEFAULT_PROMPT == true %}'
-            "{{'<|im_start|>system\n' + 'DEFAULT_SYSTEM_PROMPT'}}"
+            "{% if messages[0]['role'] == 'system' %}"
+            '{% set loop_messages = messages[1:] %}'
+            "{% set system_message = messages[0]['content'] %}"
+            "{% elif USE_DEFAULT_PROMPT == true and not 'system' in messages[0]['role'] %}"
+            '{% set loop_messages = messages %}'
+            "{% set system_message = 'DEFAULT_SYSTEM_PROMPT' %}"
+            '{% else %}'
+            '{% set loop_messages = messages %}'
+            '{% set system_message = false %}'
+            '{% endif %}'
+            '{% for message in loop_messages %}'
+            '{% if loop.index0 == 0 %}'
+            '{% if system_message != false %}'
+            "{{ '<|im_start|>system\n' + system_message.strip() + '\n'}}"
+            '{% endif %}'
+            "{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}"
+            '{% else %}'
+            "{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}"
+            '{% endif %}'
+            '{% if (add_generation_prompt == true and loop.last) %}'
+            "{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}"
+            "{% elif (message['role'] == 'assistant') %}"
+            '{{ eos_token }}'
             '{% endif %}'
-            '{% for message in messages %}'
-            "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
             '{% endfor %}')
         template = template.replace(
             'USE_DEFAULT_PROMPT',
diff --git a/tests/tokenizers/test_tiktoken.py b/tests/tokenizers/test_tiktoken.py
index 60907092c8..1ade2ea156 100644
--- a/tests/tokenizers/test_tiktoken.py
+++ b/tests/tokenizers/test_tiktoken.py
@@ -39,28 +39,88 @@
     ('gpt2', None),
 ]
 
-MULTI_TURN_CHAT_ML = [[{
+DEFAULT_SYSTEM_PROMPT = """<|im_start|>system\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible."""
+
+MULTI_TURN_CHAT_ML = [
+    [{
+        'content':
+            'Please summarize the goals in this text:\n\nGoing outside has benefits include reducing stress and triggering the relaxation response, which can help us not only feel better mentally, but even heal faster from physical ailments.',
+        'role':
+            'user'
+    }, {
+        'content': 'You should go outside and touch grass.',
+        'role': 'assistant'
+    }],
+    [{
+        'content':
+            'You are a honest and helpful AI language model. Tell the user the truth, the whole truth, and nothing but the truth.',
+        'role':
+            'system'
+    }, {
+        'content':
+            'Please summarize the goals in this text:\n\nGoing outside has benefits include reducing stress and triggering the relaxation response, which can help us not only feel better mentally, but even heal faster from physical ailments.',
+        'role':
+            'user'
+    }, {
+        'content': 'You should go outside and touch grass.',
+        'role': 'assistant'
+    }]
+]
+
+MULTI_TURN_CHAT_STRING_NO_SYSTEM_PROMPT = [
+    """<|im_start|>user
+Please summarize the goals in this text:
+
+Going outside has benefits include reducing stress and triggering the relaxation response, which can help us not only feel better mentally, but even heal faster from physical ailments.<|im_end|>
+<|im_start|>assistant
+You should go outside and touch grass.<|im_end|><|endoftext|>""",
+    """<|im_start|>system
+You are a honest and helpful AI language model. Tell the user the truth, the whole truth, and nothing but the truth.
+<|im_start|>user
+Please summarize the goals in this text:
+
+Going outside has benefits include reducing stress and triggering the relaxation response, which can help us not only feel better mentally, but even heal faster from physical ailments.<|im_end|>
+<|im_start|>assistant
+You should go outside and touch grass.<|im_end|><|endoftext|>"""
+]
+
+MULTI_TURN_CHAT_STRING_SYSTEM_PROMPT = [
+    """<|im_start|>system
+You are a helpful, respectful and honest assistant. Always answer as helpfully as possible.
+<|im_start|>user
+Please summarize the goals in this text:
+
+Going outside has benefits include reducing stress and triggering the relaxation response, which can help us not only feel better mentally, but even heal faster from physical ailments.<|im_end|>
+<|im_start|>assistant
+You should go outside and touch grass.<|im_end|><|endoftext|>""",
+    """<|im_start|>system
+You are a honest and helpful AI language model. Tell the user the truth, the whole truth, and nothing but the truth.
+<|im_start|>user
+Please summarize the goals in this text:
+
+Going outside has benefits include reducing stress and triggering the relaxation response, which can help us not only feel better mentally, but even heal faster from physical ailments.<|im_end|>
+<|im_start|>assistant
+You should go outside and touch grass.<|im_end|><|endoftext|>"""
+]
+
+MULTI_TURN_GENERATE_CHAT_ML = [[{
     'content':
         'Please summarize the goals in this text:\n\nGoing outside has benefits include reducing stress and triggering the relaxation response, which can help us not only feel better mentally, but even heal faster from physical ailments.',
     'role':
         'user'
-}, {
-    'content': 'You should go outside and touch grass.',
-    'role': 'assistant'
 }]]
 
-MULTI_TURN_CHAT_STRING = [
-    """<|im_start|>user
+MULTI_TURN_GENERATE_STRING = [
+    """<|im_start|>system
+You are a helpful, respectful and honest assistant. Always answer as helpfully as possible.
+<|im_start|>user
 Please summarize the goals in this text:
 
 Going outside has benefits include reducing stress and triggering the relaxation response, which can help us not only feel better mentally, but even heal faster from physical ailments.<|im_end|>
 <|im_start|>assistant
-You should go outside and touch grass.<|im_end|>
 """
 ]
 
-DEFAULT_SYSTEM_PROMPT = """<|im_start|>system\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible."""
-
 
 def get_tokenizers_for_testing(
     model_name: Optional[str],
@@ -306,10 +366,9 @@ def test_chat_formatting(model_name: Optional[str],
         add_eos_token=False,
         additional_special_tokens=special_tokens_to_add)
     for i, dict_chats in enumerate(MULTI_TURN_CHAT_ML):
-        chat_str = wrapped_tokenizer.apply_chat_template(dict_chats,
-                                                         tokenize=False)
-        assert chat_str == MULTI_TURN_CHAT_STRING[i]
-
+        chat_str = wrapped_tokenizer.apply_chat_template(
+            dict_chats, tokenize=False, add_generation_prompt=False)
+        assert chat_str == MULTI_TURN_CHAT_STRING_NO_SYSTEM_PROMPT[i]
     # Using default system prompt.
     wrapped_tokenizer, _, _ = get_tokenizers_for_testing(
         model_name,
@@ -320,6 +379,10 @@ def test_chat_formatting(model_name: Optional[str],
         add_eos_token=False,
         additional_special_tokens=special_tokens_to_add)
     for i, dict_chats in enumerate(MULTI_TURN_CHAT_ML):
-        chat_str = wrapped_tokenizer.apply_chat_template(dict_chats,
-                                                         tokenize=False)
-        assert chat_str == DEFAULT_SYSTEM_PROMPT + MULTI_TURN_CHAT_STRING[i]
+        chat_str = wrapped_tokenizer.apply_chat_template(
+            dict_chats, tokenize=False, add_generation_prompt=False)
+        assert chat_str == MULTI_TURN_CHAT_STRING_SYSTEM_PROMPT[i]
+    for i, dict_chats in enumerate(MULTI_TURN_GENERATE_CHAT_ML):
+        chat_str = wrapped_tokenizer.apply_chat_template(
+            dict_chats, tokenize=False, add_generation_prompt=True)
+        assert chat_str == MULTI_TURN_GENERATE_STRING[i]

From 2017c028225a1ba483e6643937107166831682c7 Mon Sep 17 00:00:00 2001
From: Daniel King <43149077+dakinggg@users.noreply.github.com>
Date: Thu, 7 Dec 2023 17:36:52 -0800
Subject: [PATCH 3/3] Remove tests and support for torch <2.1 (#787)

---
 .github/workflows/docker.yaml                    |  6 ------
 .github/workflows/pr-cpu.yaml                    |  4 ----
 .github/workflows/pr-gpu.yaml                    |  5 -----
 README.md                                        | 15 ++-------------
 llmfoundry/data/packing.py                       |  3 +--
 mcli/mcli-hf-eval.yaml                           |  2 +-
 mcli/mcli-openai-eval.yaml                       |  2 +-
 setup.py                                         |  2 +-
 .../inference/test_convert_composer_to_hf.py     |  4 ++--
 tests/a_scripts/train/test_train.py              |  5 ++---
 tests/data/test_dataloader.py                    |  8 ++++----
 tests/data/test_packing.py                       |  4 ++--
 tests/models/test_fsdp_act_checkpoint.py         |  5 +----
 tests/optim/test_lion8b.py                       | 16 +++-------------
 14 files changed, 20 insertions(+), 61 deletions(-)

diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml
index f6dac79fe5..bb538dbe9b 100644
--- a/.github/workflows/docker.yaml
+++ b/.github/workflows/docker.yaml
@@ -17,12 +17,6 @@ jobs:
     strategy:
       matrix:
         include:
-        - name: '1.13.1_cu117'
-          base_image: mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
-          dep_groups: '[gpu]'
-        - name: '2.0.1_cu118'
-          base_image: mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04
-          dep_groups: '[gpu]'
         - name: '2.1.0_cu121'
           base_image: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04
           dep_groups: '[gpu]'
diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml
index f57362ac82..c5bb4d641c 100644
--- a/.github/workflows/pr-cpu.yaml
+++ b/.github/workflows/pr-cpu.yaml
@@ -19,10 +19,6 @@ jobs:
     strategy:
       matrix:
         include:
-        - name: 'cpu-1.13.1'
-          container: mosaicml/pytorch:1.13.1_cpu-python3.10-ubuntu20.04
-          markers: 'not gpu'
-          pytest_command: 'coverage run -m pytest'
         - name: 'cpu-2.1.0'
           container: mosaicml/pytorch:2.1.0_cpu-python3.10-ubuntu20.04
           markers: 'not gpu'
diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml
index 87ae173e77..7d9320f9e0 100644
--- a/.github/workflows/pr-gpu.yaml
+++ b/.github/workflows/pr-gpu.yaml
@@ -19,11 +19,6 @@ jobs:
     strategy:
       matrix:
         include:
-        - name: 'gpu-1.13.1'
-          container: mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
-          markers: 'gpu'
-          pytest_command: 'coverage run -m pytest'
-          deps_group: 'all'
         - name: 'gpu-2.1.0'
           container: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04
           markers: 'gpu'
diff --git a/README.md b/README.md
index 4a4e60e844..f7b5148cf6 100644
--- a/README.md
+++ b/README.md
@@ -85,21 +85,14 @@ Something missing? Contribute with a PR!
 
 
 # Hardware and Software Requirements
-This codebase has been tested with PyTorch 1.13.1 and PyTorch 2.0.1 on systems with NVIDIA A100s and H100s.
+This codebase has been tested with PyTorch 2.1 on systems with NVIDIA A100s and H100s.
 This codebase may also work on systems with other devices, such as consumer NVIDIA cards and AMD cards, but we are not actively testing these systems.
 If you have success/failure using LLM Foundry on other systems, please let us know in a Github issue and we will update the support matrix!
 
 | Device         | Torch Version | Cuda Version | Status                       |
 | -------------- | ------------- | ------------ | ---------------------------- |
-| A100-40GB/80GB | 1.13.1        | 11.7         | :white_check_mark: Supported |
-| A100-40GB/80GB | 2.0.1         | 11.7, 11.8   | :white_check_mark: Supported |
-| A100-40GB/80GB | 2.1.0         | 11.8, 12.1   | :white_check_mark: Supported |
-| H100-80GB      | 1.13.1        | 11.7         | :x: Not Supported            |
-| H100-80GB      | 2.0.1         | 11.8         | :white_check_mark: Supported |
+| A100-40GB/80GB | 2.1.0         | 12.1         | :white_check_mark: Supported |
 | H100-80GB      | 2.1.0         | 12.1         | :white_check_mark: Supported |
-| A10-24GB       | 1.13.1        | 11.7         | :construction: In Progress   |
-| A10-24GB       | 2.0.1         | 11.7, 11.8   | :construction: In Progress   |
-| MI250          | 2.0.1         | ROCm 5.4     | :construction: In Progress   |
 
 ## MosaicML Docker Images
 We highly recommend using our prebuilt Docker images. You can find them here: https://hub.docker.com/orgs/mosaicml/repositories.
@@ -113,11 +106,7 @@ You can select a specific commit hash such as `mosaicml/llm-foundry:1.13.1_cu117
 
 | Docker Image                                           | Torch Version | Cuda Version      | LLM Foundry dependencies installed? |
 | ------------------------------------------------------ | ------------- | ----------------- | ----------------------------------- |
-| `mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04` | 1.13.1        | 11.7 (Infiniband) | No                                  |
-| `mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04`  | 2.0.1         | 11.8 (Infiniband) | No                                  |
 | `mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04`  | 2.1.0         | 12.1 (Infiniband) | No                                  |
-| `mosaicml/llm-foundry:1.13.1_cu117-latest`             | 1.13.1        | 11.7 (Infiniband) | Yes                                 |
-| `mosaicml/llm-foundry:2.0.1_cu118-latest`              | 2.0.1         | 11.8 (Infiniband) | Yes                                 |
 | `mosaicml/llm-foundry:2.1.0_cu121-latest`              | 2.1.0         | 12.1 (Infiniband) | Yes (flash attention v1)            |
 | `mosaicml/llm-foundry:2.1.0_cu121_flash2-latest`       | 2.1.0         | 12.1 (Infiniband) | Yes (flash attention v2)            |
 | `mosaicml/llm-foundry:2.1.0_cu121_aws-latest`          | 2.1.0         | 12.1 (EFA)        | Yes (flash attention v1)            |
diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py
index 3fca0ade5e..45322c9b2f 100644
--- a/llmfoundry/data/packing.py
+++ b/llmfoundry/data/packing.py
@@ -5,7 +5,6 @@
 
 import numpy as np
 import torch
-from composer.utils import using_torch_2
 from omegaconf import DictConfig
 from transformers import PreTrainedTokenizerBase
 
@@ -348,7 +347,7 @@ def profile_packing(
     dataloader_cfg.dataset.packing_ratio = None
     dataloader_cfg.drop_last = False
     dataloader_cfg.num_workers = 0
-    dataloader_cfg.prefetch_factor = None if using_torch_2() else 2
+    dataloader_cfg.prefetch_factor = None
     dataloader_cfg.persistent_workers = False
 
     # Determine the packing_ratio values we'll try
diff --git a/mcli/mcli-hf-eval.yaml b/mcli/mcli-hf-eval.yaml
index 8c91dac1b4..b330ff6ec1 100644
--- a/mcli/mcli-hf-eval.yaml
+++ b/mcli/mcli-hf-eval.yaml
@@ -16,7 +16,7 @@ gpu_num: 8
 # gpu_type:
 # cluster:  # replace with your cluster here!
 
-image: mosaicml/llm-foundry:2.0.1_cu118-latest
+image: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest
 
 # The below is injected as a YAML file: /mnt/config/parameters.yaml
 parameters:
diff --git a/mcli/mcli-openai-eval.yaml b/mcli/mcli-openai-eval.yaml
index e7c26d7ae6..179b078fb6 100644
--- a/mcli/mcli-openai-eval.yaml
+++ b/mcli/mcli-openai-eval.yaml
@@ -16,7 +16,7 @@ run_name: openai-eval
 # gpu_type: #
 cluster: # replace with your cluster here!
 
-image: mosaicml/llm-foundry:2.0.1_cu118-latest
+image: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest
 
 # The below is injected as a YAML file: /mnt/config/parameters.yaml
 parameters:
diff --git a/setup.py b/setup.py
index 9bf2ef2cb0..a228105a4c 100644
--- a/setup.py
+++ b/setup.py
@@ -51,7 +51,7 @@
     'accelerate>=0.20,<0.21',  # for HF inference `device_map`
     'transformers>=4.34.1,<4.35',
     'mosaicml-streaming>=0.7.1,<0.8',
-    'torch>=1.13.1,<2.1.1',
+    'torch>=2.1,<2.1.1',
     'datasets>=2.14.5,<2.15',
     'fsspec==2023.6.0',  # newer version results in a bug in datasets that duplicates data
     'sentencepiece==0.1.97',
diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py
index 5c3d0f1830..94a2d66c6e 100644
--- a/tests/a_scripts/inference/test_convert_composer_to_hf.py
+++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py
@@ -14,7 +14,7 @@
 import transformers
 from composer import Trainer
 from composer.loggers import MLFlowLogger
-from composer.utils import dist, get_device, using_torch_2
+from composer.utils import dist, get_device
 from omegaconf import DictConfig
 from omegaconf import OmegaConf as om
 from torch.utils.data import DataLoader
@@ -497,7 +497,7 @@ def test_huggingface_conversion_callback(
         'drop_last': False,
         'num_workers': 0,
         'pin_memory': False,
-        'prefetch_factor': None if using_torch_2() else 2,
+        'prefetch_factor': None,
         'persistent_workers': False,
         'timeout': 0
     }
diff --git a/tests/a_scripts/train/test_train.py b/tests/a_scripts/train/test_train.py
index 62075383cc..a26b0c1879 100644
--- a/tests/a_scripts/train/test_train.py
+++ b/tests/a_scripts/train/test_train.py
@@ -6,7 +6,6 @@
 
 import pytest
 from composer.loggers import InMemoryLogger
-from composer.utils import using_torch_2
 from omegaconf import DictConfig, ListConfig
 from omegaconf import OmegaConf as om
 
@@ -36,10 +35,10 @@ def test_train_gauntlet(averages: Optional[dict], tmp_path: pathlib.Path):
     test_cfg.icl_subset_num_batches = 1
     test_cfg.eval_subset_num_batches = 2
     test_cfg.train_loader.num_workers = 0
-    test_cfg.train_loader.prefetch_factor = None if using_torch_2() else 2
+    test_cfg.train_loader.prefetch_factor = None
     test_cfg.train_loader.persistent_workers = False
     test_cfg.eval_loader.num_workers = 0
-    test_cfg.eval_loader.prefetch_factor = None if using_torch_2() else 2
+    test_cfg.eval_loader.prefetch_factor = None
     test_cfg.eval_loader.persistent_workers = False
 
     test_cfg.eval_gauntlet = DictConfig({
diff --git a/tests/data/test_dataloader.py b/tests/data/test_dataloader.py
index 747021e82a..728376229b 100644
--- a/tests/data/test_dataloader.py
+++ b/tests/data/test_dataloader.py
@@ -13,7 +13,7 @@
 import pytest
 import torch
 import transformers
-from composer.utils import dist, using_torch_2
+from composer.utils import dist
 from omegaconf import DictConfig
 from omegaconf import OmegaConf as om
 from streaming import MDSWriter
@@ -272,7 +272,7 @@ def test_finetuning_dataloader(decoder_only_format: bool,
         'drop_last': False,
         'num_workers': 0,
         'pin_memory': False,
-        'prefetch_factor': None if using_torch_2() else 2,
+        'prefetch_factor': None,
         'persistent_workers': False,
         'timeout': 0
     }
@@ -569,7 +569,7 @@ def test_malformed_data(
         },
         'drop_last': False,
         'num_workers': 0,
-        'prefetch_factor': None if using_torch_2() else 2,
+        'prefetch_factor': None,
         'pin_memory': False,
         'persistent_workers': False,
         'timeout': 0
@@ -679,7 +679,7 @@ def test_token_counting_func_dataloader_setting(
     common_args = {
         'drop_last': False,
         'num_workers': 0,
-        'prefetch_factor': None if using_torch_2() else 2,
+        'prefetch_factor': None,
         'pin_memory': False,
         'persistent_workers': False,
         'timeout': 0
diff --git a/tests/data/test_packing.py b/tests/data/test_packing.py
index 73453b6782..a86d88f360 100644
--- a/tests/data/test_packing.py
+++ b/tests/data/test_packing.py
@@ -6,7 +6,7 @@
 
 import pytest
 import torch
-from composer.utils import dist, reproducibility, using_torch_2
+from composer.utils import dist, reproducibility
 from omegaconf import DictConfig
 from pytest import approx
 from torch.utils.data import DataLoader
@@ -172,7 +172,7 @@ def test_packing_with_dataloader(packing_ratio: Any):
         # Gets copied per worker and we cannot check the waste for child processes.
         'num_workers': 0,
         'pin_memory': False,
-        'prefetch_factor': None if using_torch_2() else 2,
+        'prefetch_factor': None,
         'persistent_workers': False,
         'timeout': 0,
     })
diff --git a/tests/models/test_fsdp_act_checkpoint.py b/tests/models/test_fsdp_act_checkpoint.py
index a7e41a3fc2..987ea5f2a7 100644
--- a/tests/models/test_fsdp_act_checkpoint.py
+++ b/tests/models/test_fsdp_act_checkpoint.py
@@ -5,7 +5,7 @@
 
 import pytest
 from composer import Trainer
-from composer.utils import get_device, using_torch_2
+from composer.utils import get_device
 from omegaconf import OmegaConf as om
 from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import \
     CheckpointWrapper
@@ -65,9 +65,6 @@ def test_fsdp_act_checkpoint(activation_checkpointing: bool,
          ]:
         module = trainer.state.model.model._fsdp_wrapped_module.transformer.blocks[
             0]._fsdp_wrapped_module
-        if not using_torch_2():
-            module = trainer.state.model.model._fsdp_wrapped_module.transformer.blocks[
-                0]._fsdp_wrapped_module._fpw_module
         assert isinstance(module, CheckpointWrapper)
     elif activation_checkpointing_target == [
             'grouped_query_attention'
diff --git a/tests/optim/test_lion8b.py b/tests/optim/test_lion8b.py
index d5b284b23c..b421c6d250 100644
--- a/tests/optim/test_lion8b.py
+++ b/tests/optim/test_lion8b.py
@@ -6,23 +6,15 @@
 import warnings
 
 import numpy as np
-import packaging.version as version
 import pytest
 import torch
 import torch.distributed as dist
 import torch.nn as nn
 from torch.distributed import fsdp
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
-
-if version.parse(torch.__version__) >= version.parse('2.0.1'):
-    from torch.distributed.fsdp.api import (  # type:ignore .api not in public API
-        FullOptimStateDictConfig, LocalOptimStateDictConfig,
-        ShardedOptimStateDictConfig)
-else:
-    from unittest.mock import MagicMock  # for pyright so vars aren't None
-    FullOptimStateDictConfig = MagicMock()
-    LocalOptimStateDictConfig = MagicMock()
-    ShardedOptimStateDictConfig = MagicMock()
+from torch.distributed.fsdp.api import (  # type:ignore .api not in public API
+    FullOptimStateDictConfig, LocalOptimStateDictConfig,
+    ShardedOptimStateDictConfig)
 
 from llmfoundry.optim import DecoupledLionW
 from llmfoundry.optim import DecoupledLionW_8bit as Lion8bit
@@ -420,8 +412,6 @@ def test_fsdp_save_load(dtype: torch.dtype, use_errors: bool,
     device = 'cuda'
     if torch.cuda.device_count() < 2:
         pytest.skip(f'This test requires 2+ GPUs.')
-    if version.parse(torch.__version__) < version.parse('2.0.1'):
-        pytest.skip(f'This test requires torch 2.0.1 or greater.')
 
     torch.cuda.set_device(f'cuda:{os.environ["RANK"]}')  # needed for fsdp
     if not dist.is_initialized():