diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py
index a29dee7683..4b80ffef54 100644
--- a/llmfoundry/data/finetuning/tasks.py
+++ b/llmfoundry/data/finetuning/tasks.py
@@ -47,25 +47,46 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:
 
 __all__ = ['dataset_constructor']
 
+_ALLOWED_RESPONSE_KEYS = {'response', 'completion'}
+_ALLOWED_PROMPT_KEYS = {'prompt'}
+
 
 def _tokenize_formatted_example(
         example: Dict[str, Any],
         tokenizer: PreTrainedTokenizerBase) -> Dict[str, List[int]]:
-    if ('prompt' not in example) or ('response' not in example):
+    """Tokenize a formatted example and validate expected keys."""
+    example_keys = set(example.keys())
+    prompt_keys = example_keys.intersection(_ALLOWED_PROMPT_KEYS)
+    response_keys = example_keys.intersection(_ALLOWED_RESPONSE_KEYS)
+
+    if len(prompt_keys) != 1:
+        raise KeyError(
+            f'Unable to tokenize example because {len(prompt_keys)} of the allowed prompt keys ' +\
+            f'were present in {example_keys=}. Please specify exactly one. {_ALLOWED_PROMPT_KEYS=}'
+        )
+
+    if len(response_keys) != 1:
         raise KeyError(
-            'Unable to tokenize example because it has not been properly formatted. ' +\
-            '"prompt" and "response" are required keys but at least one was missing ' +\
-            f'from {example=}.'
+            f'Unable to tokenize example because {len(response_keys)} of the allowed response keys ' +\
+            f'were present in {example_keys=}. Please specify exactly one. {_ALLOWED_RESPONSE_KEYS=}'
         )
-    if not isinstance(example['prompt'], str):
+
+    prompt_key = prompt_keys.pop()
+    response_key = response_keys.pop()
+    prompt = example[prompt_key]
+    response = example[response_key]
+
+    if not isinstance(prompt, str):
         raise TypeError(
-            f'Unable to tokenize example because "prompt" was not a string. {example=}'
+            f'Unable to tokenize example because {prompt_key} was not a string. {example=}'
         )
-    if not isinstance(example['response'], str):
+
+    if not isinstance(response, str):
         raise TypeError(
-            f'Unable to tokenize example because "response" was not a string. {example=}'
+            f'Unable to tokenize example because {response_key} was not a string. {example=}'
         )
-    return tokenizer(text=example['prompt'], text_target=example['response'])
+
+    return tokenizer(text=prompt, text_target=response)
 
 
 class StreamingFinetuningDataset(StreamingDataset):
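Note: with this change, _tokenize_formatted_example accepts exactly one prompt key ('prompt') and exactly one response key ('response' or 'completion'). A minimal sketch of the new behavior, assuming a local checkout with transformers installed (not part of the diff):

    from transformers import AutoTokenizer

    from llmfoundry.data.finetuning.tasks import _tokenize_formatted_example

    tokenizer = AutoTokenizer.from_pretrained('gpt2')

    # 'completion' is now accepted as an alias for 'response'
    batch = _tokenize_formatted_example(
        {'prompt': 'Q: What is 2+2? A:', 'completion': '4'}, tokenizer)
    assert 'input_ids' in batch and 'labels' in batch
    # supplying both 'response' and 'completion' is ambiguous and raises KeyError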
diff --git a/scripts/inference/convert_composer_to_hf.py b/scripts/inference/convert_composer_to_hf.py
index 1b43762473..51afb105c8 100644
--- a/scripts/inference/convert_composer_to_hf.py
+++ b/scripts/inference/convert_composer_to_hf.py
@@ -168,19 +168,11 @@ def parse_args() -> Namespace:
     return parser.parse_args()
 
 
-def convert_composer_to_hf(args: Namespace) -> None:
+def _convert_composer_to_hf(args: Namespace) -> None:
     print()
     print('#' * 30)
     print('Converting Composer checkpoint to HuggingFace checkpoint format...')
 
-    # Register MPT auto classes so that this script works with MPT
-    # This script will not work without modification for other custom models,
-    # but will work for other HuggingFace causal LMs
-    from transformers.models.auto.configuration_auto import CONFIG_MAPPING
-    CONFIG_MAPPING._extra_content['mpt'] = MPTConfig
-    MPTConfig.register_for_auto_class()
-    MPTForCausalLM.register_for_auto_class('AutoModelForCausalLM')
-
     _, _, local_folder_path = parse_uri(args.hf_output_path)
 
     config, tokenizer = write_huggingface_pretrained_from_composer_checkpoint(
@@ -296,5 +288,25 @@ def convert_composer_to_hf(args: Namespace) -> None:
     )
 
 
+def convert_composer_to_hf(args: Namespace) -> None:
+    # Register MPT auto classes so that this script works with MPT
+    # This script will not work without modification for other custom models,
+    # but will work for other HuggingFace causal LMs
+    from transformers.models.auto.configuration_auto import CONFIG_MAPPING
+    CONFIG_MAPPING._extra_content['mpt'] = MPTConfig
+    MPTConfig.register_for_auto_class()
+    MPTForCausalLM.register_for_auto_class('AutoModelForCausalLM')
+
+    try:
+        _convert_composer_to_hf(args)
+    except Exception as e:
+        raise e
+    finally:
+        # Undo auto registration after running the script
+        del CONFIG_MAPPING._extra_content['mpt']
+        delattr(MPTConfig, '_auto_class')
+        delattr(MPTForCausalLM, '_auto_class')
+
+
 if __name__ == '__main__':
     convert_composer_to_hf(parse_args())
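Note: the register/unregister pairing above could equivalently be packaged as a context manager; a hypothetical sketch of that alternative (illustrative only, not part of this PR):

    from contextlib import contextmanager

    from transformers.models.auto.configuration_auto import CONFIG_MAPPING

    from llmfoundry import MPTConfig, MPTForCausalLM


    @contextmanager
    def mpt_auto_classes_registered():
        # Register MPT auto classes for the duration of the block
        CONFIG_MAPPING._extra_content['mpt'] = MPTConfig
        MPTConfig.register_for_auto_class()
        MPTForCausalLM.register_for_auto_class('AutoModelForCausalLM')
        try:
            yield
        finally:
            # Undo the registration even if conversion fails
            del CONFIG_MAPPING._extra_content['mpt']
            delattr(MPTConfig, '_auto_class')
            delattr(MPTForCausalLM, '_auto_class')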
diff --git a/tests/a_scripts/__init__.py b/tests/a_scripts/__init__.py
new file mode 100644
index 0000000000..eb5c1d149e
--- /dev/null
+++ b/tests/a_scripts/__init__.py
@@ -0,0 +1,6 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
+# TODO: This test directory is called "a_scripts" to enforce that these tests are run
+# first. More clean up should be done to ensure tests can be run in any order and
+# don't leave around artifacts
diff --git a/tests/a_scripts/data_prep/__init__.py b/tests/a_scripts/data_prep/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/a_scripts/data_prep/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/a_scripts/data_prep/test_convert_dataset_hf.py b/tests/a_scripts/data_prep/test_convert_dataset_hf.py
new file mode 100644
index 0000000000..f226b0a4be
--- /dev/null
+++ b/tests/a_scripts/data_prep/test_convert_dataset_hf.py
@@ -0,0 +1,28 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+from argparse import Namespace
+from pathlib import Path
+
+from scripts.data_prep.convert_dataset_hf import main as main_hf
+
+
+def test_download_script_from_api(tmp_path: Path):
+    # test calling it directly
+    path = os.path.join(tmp_path, 'my-copy-c4-1')
+    main_hf(
+        Namespace(
+            **{
+                'dataset': 'c4',
+                'data_subset': 'en',
+                'splits': ['val_xsmall'],
+                'out_root': path,
+                'compression': None,
+                'concat_tokens': None,
+                'bos_text': None,
+                'eos_text': None,
+                'no_wrap': False,
+                'num_workers': None
+            }))
+    assert os.path.exists(path)
diff --git a/tests/a_scripts/data_prep/test_convert_dataset_json.py b/tests/a_scripts/data_prep/test_convert_dataset_json.py
new file mode 100644
index 0000000000..179b8a701b
--- /dev/null
+++ b/tests/a_scripts/data_prep/test_convert_dataset_json.py
@@ -0,0 +1,27 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+from argparse import Namespace
+from pathlib import Path
+
+from scripts.data_prep.convert_dataset_json import main as main_json
+
+
+def test_json_script_from_api(tmp_path: Path):
+    # test calling it directly
+    path = os.path.join(tmp_path, 'my-copy-arxiv-1')
+    main_json(
+        Namespace(
+            **{
+                'path': 'scripts/data_prep/example_data/arxiv.jsonl',
+                'out_root': path,
+                'compression': None,
+                'split': 'train',
+                'concat_tokens': None,
+                'bos_text': None,
+                'eos_text': None,
+                'no_wrap': False,
+                'num_workers': None
+            }))
+    assert os.path.exists(path)
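Note: both tests drive the converters through main() with a Namespace that mirrors each script's argparse flags, so no subprocess is needed. The equivalent shell invocation would look roughly like this (flag spellings inferred from the Namespace keys above):

    python scripts/data_prep/convert_dataset_hf.py \
        --dataset c4 --data_subset en \
        --splits val_xsmall --out_root ./my-copy-c4-1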
diff --git a/tests/test_convert_text_to_mds.py b/tests/a_scripts/data_prep/test_convert_text_to_mds.py
similarity index 98%
rename from tests/test_convert_text_to_mds.py
rename to tests/a_scripts/data_prep/test_convert_text_to_mds.py
index ab8c25bc2d..cc293a2cdd 100644
--- a/tests/test_convert_text_to_mds.py
+++ b/tests/a_scripts/data_prep/test_convert_text_to_mds.py
@@ -2,13 +2,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import os
-import sys
-
-import pytest
-
-# Add repo root to path so we can import scripts and test it
-repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
-sys.path.append(repo_dir)
 import pathlib
 from concurrent.futures import ProcessPoolExecutor
 from glob import glob
@@ -16,6 +9,7 @@ from unittest.mock import Mock, patch
 
 import numpy as np
+import pytest
 from streaming import StreamingDataset
 from transformers import AutoTokenizer
 
 
diff --git a/tests/a_scripts/eval/__init__.py b/tests/a_scripts/eval/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/a_scripts/eval/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/test_eval.py b/tests/a_scripts/eval/test_eval.py
similarity index 89%
rename from tests/test_eval.py
rename to tests/a_scripts/eval/test_eval.py
index 2fc96bb7ad..e8d86903dc 100644
--- a/tests/test_eval.py
+++ b/tests/a_scripts/eval/test_eval.py
@@ -4,8 +4,7 @@
 import copy
 import os
 import pathlib
-import sys
-from typing import Any
+from typing import Any, Union
 
 import omegaconf as om
 import pytest
@@ -14,15 +13,10 @@
 
 from llmfoundry import COMPOSER_MODEL_REGISTRY
 from llmfoundry.utils import build_tokenizer
+from scripts.eval.eval import main  # noqa: E402
 from tests.data_utils import (create_arxiv_dataset, create_c4_dataset_xxsmall,
                               gpt_tiny_cfg)
 
-# Add repo root to path so we can import scripts and test it
-repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
-sys.path.append(repo_dir)
-
-from scripts.eval.eval import main  # noqa: E402
-
 
 @pytest.fixture(autouse=True)
 def set_correct_cwd():
@@ -35,11 +29,16 @@ def set_correct_cwd():
         os.chdir('..')
 
 
-@pytest.fixture()
-def mock_saved_model_path():
-    # load the eval and model config
-    with open('eval/yamls/test_eval.yaml', 'r', encoding='utf-8') as f:
+@pytest.fixture
+def eval_cfg(foundry_dir: str) -> Union[om.ListConfig, om.DictConfig]:
+    yaml_path = os.path.join(foundry_dir, 'scripts/eval/yamls/test_eval.yaml')
+    with open(yaml_path, 'r', encoding='utf-8') as f:
         eval_cfg = om.OmegaConf.load(f)
+    return eval_cfg
+
+
+@pytest.fixture()
+def mock_saved_model_path(eval_cfg: Union[om.ListConfig, om.DictConfig]):
     model_cfg = eval_cfg.models[0]
     # set device to cpu
     device = 'cpu'
@@ -60,12 +59,11 @@ def mock_saved_model_path():
     os.remove(saved_model_path)
 
 
-def test_icl_eval(capfd: Any, mock_saved_model_path: Any):
-    with open('eval/yamls/test_eval.yaml', 'r', encoding='utf-8') as f:
-        test_cfg = om.OmegaConf.load(f)
-    test_cfg.models[0].load_path = mock_saved_model_path
-    assert isinstance(test_cfg, om.DictConfig)
-    main(test_cfg)
+def test_icl_eval(eval_cfg: Union[om.ListConfig, om.DictConfig], capfd: Any,
+                  mock_saved_model_path: Any):
+    eval_cfg.models[0].load_path = mock_saved_model_path
+    assert isinstance(eval_cfg, om.DictConfig)
+    main(eval_cfg)
     out, _ = capfd.readouterr()
     expected_results = '| Category                    | Benchmark      | Subtask   |   Accuracy | Number few shot   | Model    |\n|:----------------------------|:---------------|:----------|-----------:|:------------------|:---------|\n| language_understanding_lite | lambada_openai |           |          0 | 0-shot            | tiny_mpt |'
     assert expected_results in out
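Note: the refactor replaces repeated YAML loads with chained fixtures (foundry_dir -> eval_cfg -> mock_saved_model_path -> test_icl_eval), so the config is parsed once and shared. A generic sketch of the pytest pattern, with illustrative names only:

    import pytest


    @pytest.fixture
    def base_cfg() -> dict:
        return {'load_path': None}


    @pytest.fixture
    def checkpoint_path(base_cfg: dict) -> str:
        # pytest resolves base_cfg first, then injects it here
        return '/tmp/model.pt'


    def test_eval(base_cfg: dict, checkpoint_path: str):
        base_cfg['load_path'] = checkpoint_path
        assert base_cfg['load_path'] is not None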
diff --git a/tests/test_eval_inputs.py b/tests/a_scripts/eval/test_eval_inputs.py
similarity index 86%
rename from tests/test_eval_inputs.py
rename to tests/a_scripts/eval/test_eval_inputs.py
index 83104b62b7..8694546c4f 100644
--- a/tests/test_eval_inputs.py
+++ b/tests/a_scripts/eval/test_eval_inputs.py
@@ -2,7 +2,6 @@
 # SPDX-License-Identifier: Apache-2.0
 import copy
 import os
-import sys
 import warnings
 
 import omegaconf
@@ -10,10 +9,6 @@
 from omegaconf import DictConfig
 from omegaconf import OmegaConf as om
 
-# Add repo root to path so we can import scripts and test it
-repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
-sys.path.append(repo_dir)
-
 from scripts.eval.eval import main  # noqa: E402
 
 
@@ -21,10 +16,12 @@ class TestHuggingFaceEvalYAMLInputs:
     """Validate and tests error handling for the input YAML file."""
 
     @pytest.fixture
-    def cfg(self) -> DictConfig:
+    def cfg(self, foundry_dir: str) -> DictConfig:
         """Create YAML cfg fixture for testing purposes."""
-        conf_path: str = os.path.join(repo_dir,
-                                      'scripts/eval/yamls/hf_eval.yaml')
+        conf_path: str = os.path.join(
+            foundry_dir,
+            'scripts/eval/yamls/hf_eval.yaml',
+        )
         with open(conf_path, 'r', encoding='utf-8') as config:
             test_cfg = om.load(config)
         assert isinstance(test_cfg, DictConfig)
@@ -78,15 +75,17 @@ def test_optional_mispelled_params_raise_warning(self,
 class TestMPTEvalYAMLInputs:
 
     @pytest.fixture
-    def cfg(self) -> DictConfig:
+    def cfg(self, foundry_dir: str) -> DictConfig:
         """Create YAML cfg fixture for testing purposes."""
-        conf_path: str = os.path.join(repo_dir,
-                                      'scripts/eval/yamls/mpt_eval.yaml')
+        conf_path: str = os.path.join(
+            foundry_dir,
+            'scripts/eval/yamls/mpt_eval.yaml',
+        )
         with open(conf_path, 'r', encoding='utf-8') as config:
             test_cfg = om.load(config)
 
         test_cfg.icl_tasks[0].dataset_uri = os.path.join(
-            repo_dir, 'scripts', test_cfg.icl_tasks[0].dataset_uri)
+            foundry_dir, 'scripts', test_cfg.icl_tasks[0].dataset_uri)
 
         # make tests use cpu initialized transformer models only
         test_cfg.models[0].model.init_device = 'cpu'
diff --git a/tests/a_scripts/inference/__init__.py b/tests/a_scripts/inference/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/a_scripts/inference/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/test_hf_conversion_script.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py
similarity index 99%
rename from tests/test_hf_conversion_script.py
rename to tests/a_scripts/inference/test_convert_composer_to_hf.py
index f9191cd701..d21c942dee 100644
--- a/tests/test_hf_conversion_script.py
+++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py
@@ -4,34 +4,26 @@
 import math
 import os
 import pathlib
-import sys
-from typing import Callable
-from unittest.mock import ANY, MagicMock, patch
-
-from composer import Trainer
-from composer.loggers import MLFlowLogger
-from composer.utils import dist, get_device, using_torch_2
-
-from llmfoundry.callbacks import HuggingFaceCheckpointer
-from llmfoundry.models.mpt.modeling_mpt import ComposerMPTCausalLM
-
-# Add repo root to path so we can import scripts and test it
-repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
-sys.path.append(repo_dir)
 import shutil
 from argparse import Namespace
-from typing import Optional, cast
+from typing import Callable, Optional, cast
+from unittest.mock import ANY, MagicMock, patch
 
 import pytest
 import torch
 import transformers
+from composer import Trainer
+from composer.loggers import MLFlowLogger
+from composer.utils import dist, get_device, using_torch_2
 from omegaconf import DictConfig
 from omegaconf import OmegaConf as om
 from torch.utils.data import DataLoader
 from transformers import PreTrainedModel, PreTrainedTokenizerBase
 
 from llmfoundry import COMPOSER_MODEL_REGISTRY
+from llmfoundry.callbacks import HuggingFaceCheckpointer
 from llmfoundry.data.finetuning import build_finetuning_dataloader
+from llmfoundry.models.mpt.modeling_mpt import ComposerMPTCausalLM
 from llmfoundry.utils.builders import build_optimizer, build_tokenizer
 from scripts.inference.convert_composer_to_hf import convert_composer_to_hf
 from tests.data_utils import make_tiny_ft_dataset
diff --git a/tests/a_scripts/train/__init__.py b/tests/a_scripts/train/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/a_scripts/train/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/test_training.py b/tests/a_scripts/train/test_train.py
similarity index 90%
rename from tests/test_training.py
rename to tests/a_scripts/train/test_train.py
index 3cd2963100..62075383cc 100644
--- a/tests/test_training.py
+++ b/tests/a_scripts/train/test_train.py
@@ -1,9 +1,8 @@
 # Copyright 2022 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0
 import copy
-import os
 import pathlib
-from typing import Any, Optional
+from typing import Optional
 
 import pytest
 from composer.loggers import InMemoryLogger
@@ -16,22 +15,10 @@
                               gpt_tiny_cfg)
 
 
-@pytest.fixture(autouse=False)
-def set_correct_cwd():
-    if not os.getcwd().endswith('llm-foundry/scripts'):
-        os.chdir('scripts')
-
-    yield
-
-    if os.getcwd().endswith('llm-foundry/scripts'):
-        os.chdir('..')
-
-
 @pytest.mark.parametrize('averages', [{
     'core_average': ['language_understanding_lite']
 }, None])
-def test_train_gauntlet(averages: Optional[dict], set_correct_cwd: Any,
-                        tmp_path: pathlib.Path):
+def test_train_gauntlet(averages: Optional[dict], tmp_path: pathlib.Path):
     """Test training run with a small dataset."""
     dataset_name = create_c4_dataset_xxsmall(tmp_path)
     test_cfg = gpt_tiny_cfg(dataset_name, 'cpu')
@@ -40,7 +27,7 @@ def test_train_gauntlet(averages: Optional[dict], set_correct_cwd: Any,
             'label':
                 'lambada_openai',
             'dataset_uri':
-                'eval/local_data/language_understanding/lambada_openai_small.jsonl',
+                'scripts/eval/local_data/language_understanding/lambada_openai_small.jsonl',
             'num_fewshot': [0],
             'icl_task_type':
                 'language_modeling'
@@ -110,7 +97,7 @@ def test_train_gauntlet(averages: Optional[dict], set_correct_cwd: Any,
                                                        -1][-1] == 0
 
 
-def test_train_multi_eval(set_correct_cwd: Any, tmp_path: pathlib.Path):
+def test_train_multi_eval(tmp_path: pathlib.Path):
     """Test training run with multiple eval datasets."""
     c4_dataset_name = create_c4_dataset_xxsmall(tmp_path)
     test_cfg = gpt_tiny_cfg(c4_dataset_name, 'cpu')
diff --git a/tests/test_train_inputs.py b/tests/a_scripts/train/test_train_inputs.py
similarity index 96%
rename from tests/test_train_inputs.py
rename to tests/a_scripts/train/test_train_inputs.py
index 2ed1c9c239..17eca26587 100644
--- a/tests/test_train_inputs.py
+++ b/tests/a_scripts/train/test_train_inputs.py
@@ -3,7 +2,6 @@
 import copy
 import json
 import os
-import sys
 import warnings
 
 import omegaconf
@@ -11,10 +10,6 @@
 from omegaconf import DictConfig
 from omegaconf import OmegaConf as om
 
-# Add repo root to path so we can import scripts and test it
-repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
-sys.path.append(repo_dir)
-
 from scripts.train.train import main  # noqa: E402
 
 
@@ -54,10 +49,10 @@ class TestTrainingYAMLInputs:
     """Validate and tests error handling for the input YAML file."""
 
     @pytest.fixture
-    def cfg(self) -> DictConfig:
+    def cfg(self, foundry_dir: str) -> DictConfig:
         """Create YAML cfg fixture for testing purposes."""
         conf_path: str = os.path.join(
-            repo_dir, 'scripts/train/yamls/pretrain/testing.yaml')
+            foundry_dir, 'scripts/train/yamls/pretrain/testing.yaml')
         with open(conf_path, 'r', encoding='utf-8') as config:
             test_cfg = om.load(config)
         assert isinstance(test_cfg, DictConfig)
diff --git a/tests/callbacks/__init__.py b/tests/callbacks/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/callbacks/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/test_eval_gauntlet.py b/tests/callbacks/test_eval_gauntlet_callback.py
similarity index 100%
rename from tests/test_eval_gauntlet.py
rename to tests/callbacks/test_eval_gauntlet_callback.py
diff --git a/tests/data/__init__.py b/tests/data/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/data/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/test_dataloader.py b/tests/data/test_dataloader.py
similarity index 94%
rename from tests/test_dataloader.py
rename to tests/data/test_dataloader.py
index 2e9039644b..747021e82a 100644
--- a/tests/test_dataloader.py
+++ b/tests/data/test_dataloader.py
@@ -5,7 +5,6 @@
 import pathlib
 import random
 import shutil
-import sys
 import tempfile
 from argparse import Namespace
 from typing import Literal, Optional, Union
@@ -22,14 +21,13 @@
 from llmfoundry import (build_finetuning_dataloader,
                         build_text_denoising_dataloader)
 from llmfoundry.data import build_dataloader
+from llmfoundry.data.finetuning.tasks import (_ALLOWED_PROMPT_KEYS,
+                                              _ALLOWED_RESPONSE_KEYS,
+                                              _tokenize_formatted_example)
 from llmfoundry.data.text_data import (ConcatenatedSequenceCollatorWrapper,
                                        build_text_dataloader,
                                        get_tokens_per_batch_func)
 from llmfoundry.utils.builders import build_tokenizer
-
-# Add repo root to path so we can import scripts and test it
-repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
-sys.path.append(repo_dir)
 from scripts.data_prep.convert_dataset_hf import main as main_hf
 from tests.data_utils import make_tiny_ft_dataset
 
@@ -360,10 +358,8 @@ def test_finetuning_dataloader_small_data(dataset_size: int,
     if (dist.get_world_size() * device_batch_size > dataset_size) and drop_last:
         error_context = pytest.raises(ValueError, match='Your dataset')
     if invalid_dataset:
-        error_context = pytest.raises(
-            TypeError,
-            match='Unable to tokenize example because "prompt" was not a string'
-        )
+        error_context = pytest.raises(TypeError,
+                                      match='Unable to tokenize example')
 
     with error_context:
         _ = build_finetuning_dataloader(cfg, tokenizer, device_batch_size)
@@ -372,6 +368,39 @@ def test_finetuning_dataloader_small_data(dataset_size: int,
         shutil.rmtree(tiny_dataset_folder_path)
 
 
+def test_tokenize_example_malformed():
+    no_keys = {}
+    no_prompt_key = {'response': 'response'}
+    no_response_key = {'prompt': 'prompt'}
+    extra_keys_with_prompt = {'prompt': 'prompt', 'extra': 'extra'}
+    extra_keys_with_response = {'response': 'response', 'extra': 'extra'}
+    multiple_allowed_response_keys = {
+        'prompt': 'prompt',
+        'response': 'response',
+        'completion': 'completion'
+    }
+
+    malformed_examples = [
+        no_keys, no_prompt_key, no_response_key, extra_keys_with_prompt,
+        extra_keys_with_response, multiple_allowed_response_keys
+    ]
+
+    for example in malformed_examples:
+        with pytest.raises(KeyError):
+            _tokenize_formatted_example(example, MagicMock())
+
+
+def test_tokenize_example_well_formed():
+    tokenizer = transformers.AutoTokenizer.from_pretrained('gpt2')
+
+    for prompt_key in _ALLOWED_PROMPT_KEYS:
+        for response_key in _ALLOWED_RESPONSE_KEYS:
+            example = {prompt_key: 'prompt', response_key: 'response'}
+            tokenized_example = _tokenize_formatted_example(example, tokenizer)
+            assert 'input_ids' in tokenized_example
+            assert 'labels' in tokenized_example
+
+
 @pytest.mark.parametrize('split', ['train', 'custom', 'data'])
 def test_finetuning_dataloader_custom_split(tmp_path: pathlib.Path, split: str):
     tokenizer_name = 'gpt2'
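Note: the malformed cases can pass a MagicMock() as the tokenizer because key validation in _tokenize_formatted_example raises before the tokenizer is ever invoked; only the well-formed test needs a real tokenizer. A standalone illustration (assumes the module layout in this PR):

    from unittest.mock import MagicMock

    import pytest

    from llmfoundry.data.finetuning.tasks import _tokenize_formatted_example

    # one prompt key but no response key -> fails validation;
    # the mock tokenizer is never called
    with pytest.raises(KeyError):
        _tokenize_formatted_example({'prompt': 'p', 'extra': 'x'}, MagicMock())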
diff --git a/tests/test_icl_datasets.py b/tests/data/test_icl_datasets.py
similarity index 98%
rename from tests/test_icl_datasets.py
rename to tests/data/test_icl_datasets.py
index 28d12df91d..3a730fdf19 100644
--- a/tests/test_icl_datasets.py
+++ b/tests/data/test_icl_datasets.py
@@ -10,7 +10,7 @@
 from llmfoundry.utils.builders import build_icl_evaluators
 
 
-def load_icl_config(conf_path: str = 'tests/test_tasks.yaml'):
+def load_icl_config(conf_path: str = 'tests/data/test_tasks.yaml'):
     with open(conf_path) as f:
         test_cfg = om.load(f)
     return test_cfg
diff --git a/tests/test_packing.py b/tests/data/test_packing.py
similarity index 100%
rename from tests/test_packing.py
rename to tests/data/test_packing.py
diff --git a/tests/test_tasks.yaml b/tests/data/test_tasks.yaml
similarity index 100%
rename from tests/test_tasks.yaml
rename to tests/data/test_tasks.yaml
diff --git a/tests/data_utils.py b/tests/data_utils.py
index efb4f6d7cf..a0ad6bcd13 100644
--- a/tests/data_utils.py
+++ b/tests/data_utils.py
@@ -1,14 +1,8 @@
 # Copyright 2022 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0
 
-import os
-import sys
-
-# Add repo root to path so we can import scripts and test it
-repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
-sys.path.append(repo_dir)
-
 import json
+import os
 import pathlib
 import shutil
 from argparse import Namespace
@@ -120,10 +114,14 @@ def create_arxiv_dataset(path: pathlib.Path) -> str:
     arxiv_dir = os.path.join(path, f'my-copy-arxiv')
     downloaded_split = 'train'
 
+    arxiv_path = 'data_prep/example_data/arxiv.jsonl'
+    if not os.getcwd().endswith('scripts'):
+        arxiv_path = os.path.join('scripts', arxiv_path)
+
     main_json(
         Namespace(
             **{
-                'path': 'data_prep/example_data/arxiv.jsonl',
+                'path': arxiv_path,
                 'out_root': arxiv_dir,
                 'compression': None,
                 'split': downloaded_split,
@@ -139,8 +137,11 @@ def create_arxiv_dataset(path: pathlib.Path) -> str:
 
 def gpt_tiny_cfg(dataset_name: str, device: str):
     """Create gpt tiny cfg."""
+    from tests.fixtures.autouse import REPO_DIR
     conf_path: str = os.path.join(
-        repo_dir, 'scripts/train/yamls/pretrain/testing.yaml')
+        REPO_DIR,
+        'scripts/train/yamls/pretrain/testing.yaml',
+    )
     with open(conf_path) as f:
         test_cfg = om.load(f)
     assert isinstance(test_cfg, DictConfig)
diff --git a/tests/fixtures/autouse.py b/tests/fixtures/autouse.py
index c51ccfacb0..75caa6c941 100644
--- a/tests/fixtures/autouse.py
+++ b/tests/fixtures/autouse.py
@@ -2,11 +2,17 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import gc
+import os
+import sys
 
 import pytest
 import torch
 from composer.utils import dist, get_device, reproducibility
 
+# Add llm-foundry repo root to path so we can import scripts in the tests
+REPO_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))
+sys.path.append(REPO_DIR)
+
 
 @pytest.fixture(autouse=True)
 def initialize_dist(request: pytest.FixtureRequest):
@@ -33,6 +39,11 @@ def random_seed() -> int:
     return 17
 
 
+@pytest.fixture
+def foundry_dir() -> str:
+    return REPO_DIR
+
+
 @pytest.fixture(autouse=True)
 def seed_all(random_seed: int):
     """Sets the seed for reproducibility."""
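Note: REPO_DIR is computed once in the autouse fixtures module (loaded at collection time), so the sys.path tweak runs before any test module imports scripts.*, and tests reach the repo root through the foundry_dir fixture instead of per-file path hacks. A sketch of a consuming test, assuming these fixtures are wired up via conftest:

    import os


    def test_pretrain_yaml_exists(foundry_dir: str):
        # foundry_dir resolves to the llm-foundry repo root
        yaml_path = os.path.join(foundry_dir,
                                 'scripts/train/yamls/pretrain/testing.yaml')
        assert os.path.isfile(yaml_path)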
diff --git a/tests/models/__init__.py b/tests/models/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/models/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/models/hf/__init__.py b/tests/models/hf/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/models/hf/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/test_hf_config.py b/tests/models/hf/test_hf_config.py
similarity index 100%
rename from tests/test_hf_config.py
rename to tests/models/hf/test_hf_config.py
diff --git a/tests/test_hf_mpt_gen.py b/tests/models/hf/test_hf_mpt_gen.py
similarity index 100%
rename from tests/test_hf_mpt_gen.py
rename to tests/models/hf/test_hf_mpt_gen.py
diff --git a/tests/test_hf_v_mpt.py b/tests/models/hf/test_hf_v_mpt.py
similarity index 100%
rename from tests/test_hf_v_mpt.py
rename to tests/models/hf/test_hf_v_mpt.py
diff --git a/tests/models/inference_api_wrapper/__init__.py b/tests/models/inference_api_wrapper/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/models/inference_api_wrapper/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/test_inference_api_eval_wrapper.py b/tests/models/inference_api_wrapper/test_inference_api_eval_wrapper.py
similarity index 100%
rename from tests/test_inference_api_eval_wrapper.py
rename to tests/models/inference_api_wrapper/test_inference_api_eval_wrapper.py
diff --git a/tests/models/layers/__init__.py b/tests/models/layers/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/models/layers/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/test_flash_triton_torch.py b/tests/models/layers/test_flash_triton_torch.py
similarity index 100%
rename from tests/test_flash_triton_torch.py
rename to tests/models/layers/test_flash_triton_torch.py
diff --git a/tests/test_huggingface_flash.py b/tests/models/layers/test_huggingface_flash.py
similarity index 100%
rename from tests/test_huggingface_flash.py
rename to tests/models/layers/test_huggingface_flash.py
diff --git a/tests/test_fsdp_act_checkpoint.py b/tests/models/test_fsdp_act_checkpoint.py
similarity index 100%
rename from tests/test_fsdp_act_checkpoint.py
rename to tests/models/test_fsdp_act_checkpoint.py
diff --git a/tests/test_model.py b/tests/models/test_model.py
similarity index 100%
rename from tests/test_model.py
rename to tests/models/test_model.py
diff --git a/tests/test_mpt_gen.py b/tests/models/test_mpt_gen.py
similarity index 100%
rename from tests/test_mpt_gen.py
rename to tests/models/test_mpt_gen.py
diff --git a/tests/test_onnx.py b/tests/models/test_onnx.py
similarity index 100%
rename from tests/test_onnx.py
rename to tests/models/test_onnx.py
diff --git a/tests/test_rope_dail_vs_hf.py b/tests/models/test_rope_dail_vs_hf.py
similarity index 100%
rename from tests/test_rope_dail_vs_hf.py
rename to tests/models/test_rope_dail_vs_hf.py
diff --git a/tests/models/utils/__init__.py b/tests/models/utils/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/models/utils/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/test_init_fn.py b/tests/models/utils/test_param_init_fns.py
similarity index 100%
rename from tests/test_init_fn.py
rename to tests/models/utils/test_param_init_fns.py
diff --git a/tests/optim/__init__.py b/tests/optim/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/optim/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/test_lion8b.py b/tests/optim/test_lion8b.py
similarity index 100%
rename from tests/test_lion8b.py
rename to tests/optim/test_lion8b.py
diff --git a/tests/test_scheduler.py b/tests/optim/test_scheduler.py
similarity index 100%
rename from tests/test_scheduler.py
rename to tests/optim/test_scheduler.py
diff --git a/tests/test_data_prep_scripts.py b/tests/test_data_prep_scripts.py
deleted file mode 100644
index 4fe5ed7e64..0000000000
--- a/tests/test_data_prep_scripts.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Copyright 2022 MosaicML LLM Foundry authors
-# SPDX-License-Identifier: Apache-2.0
-
-import os
-import sys
-from argparse import Namespace
-from pathlib import Path
-
-# Add repo root to path so we can import scripts and test it
-repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
-sys.path.append(repo_dir)
-from scripts.data_prep.convert_dataset_hf import main as main_hf
-from scripts.data_prep.convert_dataset_json import main as main_json
-
-
-def test_download_script_from_api(tmp_path: Path):
-    # test calling it directly
-    path = os.path.join(tmp_path, 'my-copy-c4-1')
-    main_hf(
-        Namespace(
-            **{
-                'dataset': 'c4',
-                'data_subset': 'en',
-                'splits': ['val_xsmall'],
-                'out_root': path,
-                'compression': None,
-                'concat_tokens': None,
-                'bos_text': None,
-                'eos_text': None,
-                'no_wrap': False,
-                'num_workers': None
-            }))
-    assert os.path.exists(path)
-
-
-def test_json_script_from_api(tmp_path: Path):
-    # test calling it directly
-    path = os.path.join(tmp_path, 'my-copy-arxiv-1')
-    main_json(
-        Namespace(
-            **{
-                'path': 'scripts/data_prep/example_data/arxiv.jsonl',
-                'out_root': path,
-                'compression': None,
-                'split': 'train',
-                'concat_tokens': None,
-                'bos_text': None,
-                'eos_text': None,
-                'no_wrap': False,
-                'num_workers': None
-            }))
-    assert os.path.exists(path)
diff --git a/tests/tokenizers/__init__.py b/tests/tokenizers/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/tokenizers/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/test_tiktoken.py b/tests/tokenizers/test_tiktoken.py
similarity index 99%
rename from tests/test_tiktoken.py
rename to tests/tokenizers/test_tiktoken.py
index fe3db41d50..60907092c8 100644
--- a/tests/test_tiktoken.py
+++ b/tests/tokenizers/test_tiktoken.py
@@ -9,8 +9,9 @@
 
 from llmfoundry.tokenizers.tiktoken import (TiktokenTokenizerWrapper,
                                             bytes_to_unicode)
+from tests.a_scripts.inference.test_convert_composer_to_hf import \
+    check_hf_tokenizer_equivalence
 from tests.horrible_strings import HORRIBLE_STRINGS
-from tests.test_hf_conversion_script import check_hf_tokenizer_equivalence
 
 if TYPE_CHECKING:
     from tiktoken.core import Encoding
diff --git a/tests/test_tokenizer.py b/tests/tokenizers/test_tokenizer.py
similarity index 100%
rename from tests/test_tokenizer.py
rename to tests/tokenizers/test_tokenizer.py
diff --git a/tests/utils/__init__.py b/tests/utils/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/utils/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/test_builders.py b/tests/utils/test_builders.py
similarity index 100%
rename from tests/test_builders.py
rename to tests/utils/test_builders.py
diff --git a/tests/test_model_download_utils.py b/tests/utils/test_model_download_utils.py
similarity index 100%
rename from tests/test_model_download_utils.py
rename to tests/utils/test_model_download_utils.py
diff --git a/tests/test_prompt_files.py b/tests/utils/test_prompt_files.py
similarity index 100%
rename from tests/test_prompt_files.py
rename to tests/utils/test_prompt_files.py