diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py
index 3c6d9425f7..7b4ef1e058 100644
--- a/tests/a_scripts/inference/test_convert_composer_to_hf.py
+++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py
@@ -13,7 +13,7 @@ import pytest
 import torch
 import transformers
-from composer import Trainer
+from composer import ComposerModel, Trainer
 from composer.loggers import MLFlowLogger
 from composer.utils import dist, get_device
 from omegaconf import DictConfig
@@ -29,6 +29,14 @@ from scripts.inference.convert_composer_to_hf import convert_composer_to_hf
 from tests.data_utils import make_tiny_ft_dataset
 
+_OPTIMIZER_CFG = lambda: {
+    'name': 'decoupled_adamw',
+    'lr': 6e-4,
+    'betas': [0.9, 0.95],
+    'eps': 1e-8,
+    'weight_decay': 0.0,
+}
+
 
 def _save_model_mock(*args: Any, path: str, **kwargs: Any):
     os.makedirs(path, exist_ok=True)
@@ -309,13 +317,7 @@ def test_huggingface_conversion_callback_interval(
 
     original_model = build_tiny_mpt()
 
-    optimizer_config = {
-        'name': 'decoupled_adamw',
-        'lr': 6e-4,
-        'betas': [0.9, 0.95],
-        'eps': 1e-8,
-        'weight_decay': 0.0,
-    }
+    optimizer_config = _OPTIMIZER_CFG()
     optimizer_name = optimizer_config.pop('name')
     optimizer = build_optimizer(original_model, optimizer_name,
                                 optimizer_config)
@@ -400,68 +402,8 @@ def test_huggingface_conversion_callback_interval(
     delete_transformers_cache()
 
 
-@pytest.mark.world_size(2)
-@pytest.mark.gpu
-@pytest.mark.parametrize(
-    'model,tie_word_embeddings,peft_config',
-    [
-        ('mpt', True, None),
-        ('mpt', False, None),
-        ('neo', None, None),
-        ('llama2', None, None),
-        ('llama2', None, {
-            'peft_type': 'LORA',
-            'task_type': 'CAUSAL_LM',
-            'lora_alpha': 32,
-            'lora_dropout': 0.05,
-            'r': 16,
-            'target_modules': [
-                'q_proj',
-                'k_proj',
-                'v_proj',
-            ],
-        }),
-    ],
-)
-@pytest.mark.parametrize('fsdp_state_dict_type', ['full', 'sharded', None])
-@pytest.mark.parametrize(
-    'hf_save_interval,save_interval,max_duration,expected_hf_checkpoints,expected_normal_checkpoints',
-    [('1ba', '1ba', '1ba', 1, 1)])
-@patch('os.cpu_count', MagicMock(return_value=1))
-@patch('llmfoundry.callbacks.hf_checkpointer.SpawnProcess',
-       new=MockSpawnProcess)
-def test_huggingface_conversion_callback(
-    model: str,
-    tmp_path: pathlib.Path,
-    tie_word_embeddings: bool,
-    fsdp_state_dict_type: Optional[str],
-    hf_save_interval: str,
-    save_interval: str,
-    max_duration: str,
-    expected_hf_checkpoints: int,
-    expected_normal_checkpoints: int,
-    peft_config: Optional[dict],
-):
-    delete_transformers_cache()
-
-    dist.initialize_dist(get_device('gpu'))
-
-    max_seq_len = 16
-    device_batch_size = 1
-    dataset_size = 2
-    precision_str = 'bfloat16'
-    precision = torch.bfloat16
-    batches_per_epoch = math.ceil(dataset_size / (device_batch_size * 2))
-
-    checkpointer_callback = HuggingFaceCheckpointer(
-        save_folder=os.path.join(tmp_path, 'checkpoints'),
-        save_interval=hf_save_interval,
-        precision=precision_str,
-        mlflow_registered_model_name='dummy-registered-name')
-
-    # get small version of each model
-    model_cfg = None
-    tokenizer_name = None
+def _get_model_and_tokenizer(model: str, max_seq_len: int,
+                             tie_word_embeddings: bool):
     if model == 'mpt':
         model_cfg = {
             'name': 'mpt_causal_lm',
@@ -513,12 +455,43 @@ def test_huggingface_conversion_callback(
         tokenizer_name = 'meta-llama/Llama-2-7b-hf'
     else:
         raise ValueError(f'Unknown model {model}')
-    assert model_cfg is not None
-    assert tokenizer_name is not None
-    model_cfg = om.create(model_cfg)
-    if peft_config is not None:
-        model_cfg['peft_config'] = peft_config
+    return model_cfg, tokenizer_name
+
+
+def _assert_mlflow_logger_calls(mlflow_logger_mock: MagicMock,
+                                peft_config: Optional[dict] = None):
+    if dist.get_global_rank() == 0:
+        assert mlflow_logger_mock.save_model.call_count == 1
+        if peft_config is not None:
+            expectation = {
+                'flavor': 'peft',
+                'path': ANY,
+                'save_pretrained_dir': ANY,
+                'metadata': {},
+            }
+        else:
+            import numpy as np
+
+            default_input_example = {
+                'prompt': np.array(['What is Machine Learning?'])
+            }
+
+            expectation = {
+                'flavor': 'transformers',
+                'transformers_model': ANY,
+                'path': ANY,
+                'task': 'llm/v1/completions',
+                'input_example': default_input_example,
+                'metadata': {}
+            }
+        mlflow_logger_mock.save_model.assert_called_with(**expectation)
+        assert mlflow_logger_mock.register_model_with_run_id.call_count == 1
+    else:
+        assert mlflow_logger_mock.log_model.call_count == 0
+        assert mlflow_logger_mock.register_model_with_run_id.call_count == 0
+
 
+def _get_fsdp_config(fsdp_state_dict_type: Optional[str]):
     fsdp_config = {
         'sharding_strategy': 'FULL_SHARD',
         'mixed_precision': 'PURE',
@@ -528,12 +501,10 @@ def test_huggingface_conversion_callback(
         'limit_all_gathers': True,
         'state_dict_type': fsdp_state_dict_type,
     }
+    return fsdp_config
 
-    tiny_dataset_folder_path = os.path.join(os.getcwd(), 'test-ift-data-small')
-    tiny_dataset_path = os.path.join(tiny_dataset_folder_path, 'train.jsonl')
-    if dist.get_global_rank() == 0:
-        make_tiny_ft_dataset(path=tiny_dataset_path, size=dataset_size)
 
+def _get_dataloader_cfg(tiny_dataset_folder_path: str, max_seq_len: int):
     dataloader_cfg = {
         'name': 'finetuning',
         'dataset': {
@@ -552,6 +523,198 @@ def test_huggingface_conversion_callback(
         'persistent_workers': False,
         'timeout': 0
     }
+    return dataloader_cfg
+
+
+def _assert_checkpoint_equivalence(tmp_path: pathlib.Path,
+                                   expected_normal_checkpoints: int,
+                                   expected_hf_checkpoints: int,
+                                   trainer: Trainer,
+                                   batches_per_epoch: int,
+                                   precision: torch.dtype,
+                                   model: str,
+                                   tokenizer: PreTrainedTokenizerBase,
+                                   original_model: ComposerModel,
+                                   fsdp_state_dict_type: Optional[str] = None,
+                                   peft_config: Optional[dict] = None):
+    """Asserts the equivalence of checkpoints.
+
+    Asserts equivalence between the original model and the converted HuggingFace model.
+
+    Args:
+        tmp_path (pathlib.Path): The path to the temporary directory where the checkpoints are saved.
+        expected_normal_checkpoints (int): The expected number of normal checkpoints.
+        expected_hf_checkpoints (int): The expected number of HuggingFace checkpoints.
+        trainer (Trainer): The trainer object used for training the model.
+        batches_per_epoch (int): The number of batches per epoch.
+        precision (torch.dtype): The precision of the model.
+        model (str): The type of model ('mpt', 'neo', or 'llama2').
+        tokenizer (PreTrainedTokenizerBase): The model tokenizer.
+        original_model (ComposerModel): The original model object.
+        fsdp_state_dict_type (Optional[str], optional): The type of FSDP state dict. Defaults to None.
+        peft_config (Optional[dict], optional): The PEFT configuration. Defaults to None.
+ """ + loaded_model = None + loaded_tokenizer = None + # Only rank zero is saving the huggingface checkpoints, so only check + # for equivalence on rank zero + if dist.get_global_rank() == 0: + normal_checkpoints = [ + name for name in os.listdir(os.path.join(tmp_path, 'checkpoints')) + if name != 'huggingface' + ] + huggingface_checkpoints = [ + name for name in os.listdir( + os.path.join(tmp_path, 'checkpoints', 'huggingface')) + ] + + checkpoint_files = os.listdir( + os.path.join(tmp_path, 'checkpoints', 'huggingface', + huggingface_checkpoints[-1])) + if peft_config is not None: + assert 'adapter_config.json' in checkpoint_files + assert 'adapter_model.safetensors' in checkpoint_files + + assert len(normal_checkpoints) == expected_normal_checkpoints + assert len(huggingface_checkpoints) == expected_hf_checkpoints + + # Patch flash_attn package to be empty to simulate loading the model in + # an environment without flash attention installed + with patch.dict('sys.modules', {'flash_attn': None}): + if peft_config is not None: + composer_model = trainer.state.model.module if trainer.state.is_model_ddp else trainer.state.model + composer_model.model.base_model.save_pretrained(tmp_path / + 'base-model') + + checkpoint_path = os.path.join(tmp_path, 'checkpoints', + 'huggingface', + f'ba{batches_per_epoch}') + + if peft_config is not None: + with open(os.path.join(checkpoint_path, + 'adapter_config.json')) as _f: + adapter_config = json.load(_f) + + adapter_config['base_model_name_or_path'] = str(tmp_path / + 'base-model') + + with open(os.path.join(checkpoint_path, 'adapter_config.json'), + 'w') as _f: + json.dump(adapter_config, _f) + + # Load the last huggingface checkpoint + loaded_model = transformers.AutoModelForCausalLM.from_pretrained( + checkpoint_path, + trust_remote_code=True, + ) + + # Check that the loaded model has the correct precision, and then set it back + # to the original for the equivalence check + if peft_config is None: + assert loaded_model.config.torch_dtype == precision + loaded_model.config.torch_dtype = original_model.model.config.torch_dtype + + if model == 'mpt': + # Check that we have correctly set these attributes, and then set them back + # to the original for the equivalence check + assert loaded_model.config.attn_config['attn_impl'] == 'torch' + assert loaded_model.config.init_device == 'cpu' + loaded_model.config.attn_config[ + 'attn_impl'] = original_model.model.config.attn_config[ + 'attn_impl'] + loaded_model.config.init_device = original_model.model.config.init_device + + loaded_tokenizer = transformers.AutoTokenizer.from_pretrained( + os.path.join(tmp_path, 'checkpoints', 'huggingface', + f'ba{batches_per_epoch}'), + trust_remote_code=True, + ) + + check_hf_model_equivalence( + trainer.state.model.model.to(precision) if fsdp_state_dict_type + is not None else trainer.state.model.module.model.to(precision), + loaded_model, + just_lora=peft_config is not None) + check_hf_tokenizer_equivalence(tokenizer, loaded_tokenizer) + + +@pytest.mark.world_size(2) +@pytest.mark.gpu +@pytest.mark.parametrize( + 'model,tie_word_embeddings,peft_config', + [ + ('mpt', True, None), + ('mpt', False, None), + ('neo', None, None), + ('llama2', None, None), + ('llama2', None, { + 'peft_type': 'LORA', + 'task_type': 'CAUSAL_LM', + 'lora_alpha': 32, + 'lora_dropout': 0.05, + 'r': 16, + 'target_modules': [ + 'q_proj', + 'k_proj', + 'v_proj', + ], + }), + ], +) +@pytest.mark.parametrize('fsdp_state_dict_type', ['full', 'sharded', None]) +@pytest.mark.parametrize( + 
+    'hf_save_interval,save_interval,max_duration,expected_hf_checkpoints,expected_normal_checkpoints',
+    [('1ba', '1ba', '1ba', 1, 1)])
+@patch('os.cpu_count', MagicMock(return_value=1))
+@patch('llmfoundry.callbacks.hf_checkpointer.SpawnProcess',
+       new=MockSpawnProcess)
+def test_huggingface_conversion_callback(
+    model: str,
+    tmp_path: pathlib.Path,
+    tie_word_embeddings: bool,
+    fsdp_state_dict_type: Optional[str],
+    hf_save_interval: str,
+    save_interval: str,
+    max_duration: str,
+    expected_hf_checkpoints: int,
+    expected_normal_checkpoints: int,
+    peft_config: Optional[dict],
+):
+    delete_transformers_cache()
+
+    dist.initialize_dist(get_device('gpu'))
+
+    max_seq_len = 16
+    device_batch_size = 1
+    dataset_size = 2
+    precision_str = 'bfloat16'
+    precision = torch.bfloat16
+    batches_per_epoch = math.ceil(dataset_size / (device_batch_size * 2))
+
+    checkpointer_callback = HuggingFaceCheckpointer(
+        save_folder=os.path.join(tmp_path, 'checkpoints'),
+        save_interval=hf_save_interval,
+        precision=precision_str,
+        mlflow_registered_model_name='dummy-registered-name')
+
+    # get small version of each model
+    model_cfg, tokenizer_name = _get_model_and_tokenizer(
+        model, max_seq_len, tie_word_embeddings)
+    assert model_cfg is not None
+    assert tokenizer_name is not None
+    model_cfg = om.create(model_cfg)
+    if peft_config is not None:
+        model_cfg['peft_config'] = peft_config
+
+    fsdp_config = _get_fsdp_config(fsdp_state_dict_type)
+    optimizer_config = _OPTIMIZER_CFG()
+
+    tiny_dataset_folder_path = os.path.join(os.getcwd(), 'test-ift-data-small')
+    tiny_dataset_path = os.path.join(tiny_dataset_folder_path, 'train.jsonl')
+    if dist.get_global_rank() == 0:
+        make_tiny_ft_dataset(path=tiny_dataset_path, size=dataset_size)
+
+    dataloader_cfg = _get_dataloader_cfg(tiny_dataset_folder_path, max_seq_len)
 
     dataloader_cfg = om.create(dataloader_cfg)
 
@@ -566,19 +729,8 @@ def test_huggingface_conversion_callback(
         device_batch_size,
     )
 
-    original_model = build_composer_model(
-        name=model_cfg['name'],
-        cfg=model_cfg,
-        tokenizer=tokenizer,
-    )
-
-    optimizer_config = {
-        'name': 'decoupled_adamw',
-        'lr': 6e-4,
-        'betas': [0.9, 0.95],
-        'eps': 1e-8,
-        'weight_decay': 0.0,
-    }
+    original_model = build_composer_model(model_cfg['name'], model_cfg,
+                                          tokenizer)
     optimizer_name = optimizer_config.pop('name')
     optimizer = build_optimizer(original_model, optimizer_name,
                                 optimizer_config)
@@ -605,126 +757,25 @@ def test_huggingface_conversion_callback(
     )
     trainer.fit()
 
-    if dist.get_global_rank() == 0:
-        assert mlflow_logger_mock.save_model.call_count == 1
-        if peft_config is not None:
-            expectation = {
-                'flavor': 'peft',
-                'path': ANY,
-                'save_pretrained_dir': ANY,
-                'metadata': {},
-            }
-        else:
-            import numpy as np
-
-            default_input_example = {
-                'prompt': np.array(['What is Machine Learning?'])
-            }
-
-            expectation = {
-                'flavor': 'transformers',
-                'transformers_model': ANY,
-                'path': ANY,
-                'task': 'llm/v1/completions',
-                'input_example': default_input_example,
-                'metadata': {}
-            }
-        mlflow_logger_mock.save_model.assert_called_with(**expectation)
-        assert mlflow_logger_mock.register_model_with_run_id.call_count == 1
-    else:
-        assert mlflow_logger_mock.log_model.call_count == 0
-        assert mlflow_logger_mock.register_model_with_run_id.call_count == 0
+    _assert_mlflow_logger_calls(mlflow_logger_mock, peft_config)
 
     # summon full params to check equivalence
     from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
     with FSDP.summon_full_params(trainer.state.model,
                                  writeback=False,
                                  recurse=True):
-        loaded_model = None
-        loaded_tokenizer = None
-        # Only rank zero is saving the huggingface checkpoints, so only check
-        # for equivalence on rank zero
-        if dist.get_global_rank() == 0:
-            normal_checkpoints = [
-                name
-                for name in os.listdir(os.path.join(tmp_path, 'checkpoints'))
-                if name != 'huggingface'
-            ]
-            huggingface_checkpoints = [
-                name for name in os.listdir(
-                    os.path.join(tmp_path, 'checkpoints', 'huggingface'))
-            ]
-
-            checkpoint_files = os.listdir(
-                os.path.join(tmp_path, 'checkpoints', 'huggingface',
-                             huggingface_checkpoints[-1]))
-            if peft_config is not None:
-                assert 'adapter_config.json' in checkpoint_files
-                assert 'adapter_model.safetensors' in checkpoint_files
-
-            assert len(normal_checkpoints) == expected_normal_checkpoints
-            assert len(huggingface_checkpoints) == expected_hf_checkpoints
-
-            # Patch flash_attn package to be empty to simulate loading the model in
-            # an environment without flash attention installed
-            with patch.dict('sys.modules', {'flash_attn': None}):
-                if peft_config is not None:
-                    composer_model = trainer.state.model.module if trainer.state.is_model_ddp else trainer.state.model
-                    composer_model.model.base_model.save_pretrained(
-                        tmp_path / 'base-model')
-
-                checkpoint_path = os.path.join(tmp_path, 'checkpoints',
-                                               'huggingface',
-                                               f'ba{batches_per_epoch}')
-
-                if peft_config is not None:
-                    with open(
-                            os.path.join(checkpoint_path,
-                                         'adapter_config.json')) as _f:
-                        adapter_config = json.load(_f)
-
-                    adapter_config['base_model_name_or_path'] = str(
-                        tmp_path / 'base-model')
-
-                    with open(
-                            os.path.join(checkpoint_path,
-                                         'adapter_config.json'), 'w') as _f:
-                        json.dump(adapter_config, _f)
-
-                # Load the last huggingface checkpoint
-                loaded_model = transformers.AutoModelForCausalLM.from_pretrained(
-                    checkpoint_path,
-                    trust_remote_code=True,
-                )
-
-                # Check that the loaded model has the correct precision, and then set it back
-                # to the original for the equivalence check
-                if peft_config is None:
-                    assert loaded_model.config.torch_dtype == precision
-                    loaded_model.config.torch_dtype = original_model.model.config.torch_dtype
-
-                if model == 'mpt':
-                    # Check that we have correctly set these attributes, and then set them back
-                    # to the original for the equivalence check
-                    assert loaded_model.config.attn_config['attn_impl'] == 'torch'
-                    assert loaded_model.config.init_device == 'cpu'
-                    loaded_model.config.attn_config[
-                        'attn_impl'] = original_model.model.config.attn_config[
-                            'attn_impl']
-                    loaded_model.config.init_device = original_model.model.config.init_device
-
-                loaded_tokenizer = transformers.AutoTokenizer.from_pretrained(
-                    os.path.join(tmp_path, 'checkpoints', 'huggingface',
-                                 f'ba{batches_per_epoch}'),
-                    trust_remote_code=True,
-                )
-
-            check_hf_model_equivalence(
-                trainer.state.model.model.to(precision) if fsdp_state_dict_type
-                is not None else trainer.state.model.module.model.to(precision),
-                loaded_model,
-                just_lora=peft_config is not None)
-            check_hf_tokenizer_equivalence(tokenizer, loaded_tokenizer)
+        _assert_checkpoint_equivalence(
+            tmp_path=tmp_path,
+            expected_normal_checkpoints=expected_normal_checkpoints,
+            expected_hf_checkpoints=expected_hf_checkpoints,
+            trainer=trainer,
+            batches_per_epoch=batches_per_epoch,
+            original_model=original_model,
+            precision=precision,
+            model=model,
+            tokenizer=tokenizer,
+            fsdp_state_dict_type=fsdp_state_dict_type,
+            peft_config=peft_config)
 
     dist.barrier()
     delete_transformers_cache()
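
For reference, a minimal sketch of how the extracted helpers compose, mirroring the call sites in test_huggingface_conversion_callback above. This is illustration only, not part of the patch: `tiny_model` is a hypothetical stand-in for the model returned by build_composer_model, and the build_optimizer import path is assumed to match what this test file already uses.

    # Sketch only (assumptions noted above); mirrors the rewritten test body.
    from llmfoundry.utils.builders import build_optimizer

    # Build the small model config and matching tokenizer name for one variant.
    model_cfg, tokenizer_name = _get_model_and_tokenizer(
        model='mpt', max_seq_len=16, tie_word_embeddings=True)
    fsdp_config = _get_fsdp_config(fsdp_state_dict_type='full')

    # _OPTIMIZER_CFG returns a fresh dict on every call, so pop() is safe here.
    optimizer_config = _OPTIMIZER_CFG()
    optimizer_name = optimizer_config.pop('name')  # 'decoupled_adamw'
    optimizer = build_optimizer(tiny_model, optimizer_name, optimizer_config)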