From 6a058a60a603f06f1c32ebcfee156259ef1230db Mon Sep 17 00:00:00 2001
From: Anna <anna@mosaicml.com>
Date: Fri, 1 Dec 2023 17:02:04 -0800
Subject: [PATCH] Reorganize tests to make them easier to find (#768)

* Add eval loader to eval script

* small input tests

* updates

* fix typing and formatting

* fixes, add tests

* remove circular dependency

* tests pass

* nits + small fixes

* add metrics at the end, refactor to put icl/gauntlet as helpers

* NOT

* metrics instead of models, add unit tests

* Move tests into directories

* add copyright to inits

* fix relative paths

* fixes

* revert gauntlet test change

* Support inputs_embeds (#687)

* support inputs_embeds

* update tests to test inputs_embeds

* make iids optional inputs to fwd

* remove check for both iids and inputs_embeds

in MPTForCausalLM. It is checked in the base model, and it is actually common practice to pass both during autoregressive generation: inputs_embeds are used first, then once the KV cache is non-empty, input_ids are used instead

* reorder kwargs

* add more tests

* fix device merge artifact in test_model.py

* fix generate test

* yapf

* Better error message when test does not complete (#769)

* run script tests first

* comment out

* ascripts -> scripts

* bad dirs

* try this

* hacks

* add a note about a_scripts

---------

Co-authored-by: Sam Havens <sam@mosaicml.com>
---
 scripts/inference/convert_composer_to_hf.py   | 30 +++++++----
 tests/a_scripts/__init__.py                   |  6 +++
 tests/a_scripts/data_prep/__init__.py         |  2 +
 .../data_prep/test_convert_dataset_hf.py      | 28 ++++++++++
 .../data_prep/test_convert_dataset_json.py    | 27 ++++++++++
 .../data_prep}/test_convert_text_to_mds.py    |  8 +--
 tests/a_scripts/eval/__init__.py              |  2 +
 tests/{ => a_scripts/eval}/test_eval.py       | 34 ++++++------
 .../{ => a_scripts/eval}/test_eval_inputs.py  | 23 ++++----
 tests/a_scripts/inference/__init__.py         |  2 +
 .../inference/test_convert_composer_to_hf.py} | 22 +++-----
 tests/a_scripts/train/__init__.py             |  2 +
 .../train/test_train.py}                      | 21 ++------
 .../train}/test_train_inputs.py               |  9 +---
 tests/callbacks/__init__.py                   |  2 +
 .../test_eval_gauntlet_callback.py}           |  0
 tests/data/__init__.py                        |  2 +
 tests/{ => data}/test_dataloader.py           |  5 --
 tests/{ => data}/test_icl_datasets.py         |  2 +-
 tests/{ => data}/test_packing.py              |  0
 tests/{ => data}/test_tasks.yaml              |  0
 tests/data_utils.py                           | 21 ++++----
 tests/fixtures/autouse.py                     | 11 ++++
 tests/models/__init__.py                      |  2 +
 tests/models/hf/__init__.py                   |  2 +
 tests/{ => models/hf}/test_hf_config.py       |  0
 tests/{ => models/hf}/test_hf_mpt_gen.py      |  0
 tests/{ => models/hf}/test_hf_v_mpt.py        |  0
 .../models/inference_api_wrapper/__init__.py  |  2 +
 .../test_inference_api_eval_wrapper.py        |  0
 tests/models/layers/__init__.py               |  2 +
 .../layers}/test_flash_triton_torch.py        |  0
 .../layers}/test_huggingface_flash.py         |  0
 .../{ => models}/test_fsdp_act_checkpoint.py  |  0
 tests/{ => models}/test_model.py              |  0
 tests/{ => models}/test_mpt_gen.py            |  0
 tests/{ => models}/test_onnx.py               |  0
 tests/{ => models}/test_rope_dail_vs_hf.py    |  0
 tests/models/utils/__init__.py                |  2 +
 .../utils/test_param_init_fns.py}             |  0
 tests/optim/__init__.py                       |  2 +
 tests/{ => optim}/test_lion8b.py              |  0
 tests/{ => optim}/test_scheduler.py           |  0
 tests/test_data_prep_scripts.py               | 52 -------------------
 tests/tokenizers/__init__.py                  |  2 +
 tests/{ => tokenizers}/test_tiktoken.py       |  3 +-
 tests/{ => tokenizers}/test_tokenizer.py      |  0
 tests/utils/__init__.py                       |  2 +
 tests/{ => utils}/test_builders.py            |  0
 .../{ => utils}/test_model_download_utils.py  |  0
 tests/{ => utils}/test_prompt_files.py        |  0
 51 files changed, 176 insertions(+), 154 deletions(-)
 create mode 100644 tests/a_scripts/__init__.py
 create mode 100644 tests/a_scripts/data_prep/__init__.py
 create mode 100644 tests/a_scripts/data_prep/test_convert_dataset_hf.py
 create mode 100644 tests/a_scripts/data_prep/test_convert_dataset_json.py
 rename tests/{ => a_scripts/data_prep}/test_convert_text_to_mds.py (98%)
 create mode 100644 tests/a_scripts/eval/__init__.py
 rename tests/{ => a_scripts/eval}/test_eval.py (89%)
 rename tests/{ => a_scripts/eval}/test_eval_inputs.py (86%)
 create mode 100644 tests/a_scripts/inference/__init__.py
 rename tests/{test_hf_conversion_script.py => a_scripts/inference/test_convert_composer_to_hf.py} (99%)
 create mode 100644 tests/a_scripts/train/__init__.py
 rename tests/{test_training.py => a_scripts/train/test_train.py} (90%)
 rename tests/{ => a_scripts/train}/test_train_inputs.py (96%)
 create mode 100644 tests/callbacks/__init__.py
 rename tests/{test_eval_gauntlet.py => callbacks/test_eval_gauntlet_callback.py} (100%)
 create mode 100644 tests/data/__init__.py
 rename tests/{ => data}/test_dataloader.py (99%)
 rename tests/{ => data}/test_icl_datasets.py (98%)
 rename tests/{ => data}/test_packing.py (100%)
 rename tests/{ => data}/test_tasks.yaml (100%)
 create mode 100644 tests/models/__init__.py
 create mode 100644 tests/models/hf/__init__.py
 rename tests/{ => models/hf}/test_hf_config.py (100%)
 rename tests/{ => models/hf}/test_hf_mpt_gen.py (100%)
 rename tests/{ => models/hf}/test_hf_v_mpt.py (100%)
 create mode 100644 tests/models/inference_api_wrapper/__init__.py
 rename tests/{ => models/inference_api_wrapper}/test_inference_api_eval_wrapper.py (100%)
 create mode 100644 tests/models/layers/__init__.py
 rename tests/{ => models/layers}/test_flash_triton_torch.py (100%)
 rename tests/{ => models/layers}/test_huggingface_flash.py (100%)
 rename tests/{ => models}/test_fsdp_act_checkpoint.py (100%)
 rename tests/{ => models}/test_model.py (100%)
 rename tests/{ => models}/test_mpt_gen.py (100%)
 rename tests/{ => models}/test_onnx.py (100%)
 rename tests/{ => models}/test_rope_dail_vs_hf.py (100%)
 create mode 100644 tests/models/utils/__init__.py
 rename tests/{test_init_fn.py => models/utils/test_param_init_fns.py} (100%)
 create mode 100644 tests/optim/__init__.py
 rename tests/{ => optim}/test_lion8b.py (100%)
 rename tests/{ => optim}/test_scheduler.py (100%)
 delete mode 100644 tests/test_data_prep_scripts.py
 create mode 100644 tests/tokenizers/__init__.py
 rename tests/{ => tokenizers}/test_tiktoken.py (99%)
 rename tests/{ => tokenizers}/test_tokenizer.py (100%)
 create mode 100644 tests/utils/__init__.py
 rename tests/{ => utils}/test_builders.py (100%)
 rename tests/{ => utils}/test_model_download_utils.py (100%)
 rename tests/{ => utils}/test_prompt_files.py (100%)

diff --git a/scripts/inference/convert_composer_to_hf.py b/scripts/inference/convert_composer_to_hf.py
index 1b43762473..51afb105c8 100644
--- a/scripts/inference/convert_composer_to_hf.py
+++ b/scripts/inference/convert_composer_to_hf.py
@@ -168,19 +168,11 @@ def parse_args() -> Namespace:
     return parser.parse_args()
 
 
-def convert_composer_to_hf(args: Namespace) -> None:
+def _convert_composer_to_hf(args: Namespace) -> None:
     print()
     print('#' * 30)
     print('Converting Composer checkpoint to HuggingFace checkpoint format...')
 
-    # Register MPT auto classes so that this script works with MPT
-    # This script will not work without modification for other custom models,
-    # but will work for other HuggingFace causal LMs
-    from transformers.models.auto.configuration_auto import CONFIG_MAPPING
-    CONFIG_MAPPING._extra_content['mpt'] = MPTConfig
-    MPTConfig.register_for_auto_class()
-    MPTForCausalLM.register_for_auto_class('AutoModelForCausalLM')
-
     _, _, local_folder_path = parse_uri(args.hf_output_path)
 
     config, tokenizer = write_huggingface_pretrained_from_composer_checkpoint(
@@ -296,5 +288,25 @@ def convert_composer_to_hf(args: Namespace) -> None:
     )
 
 
+def convert_composer_to_hf(args: Namespace) -> None:
+    # Register MPT auto classes so that this script works with MPT
+    # This script will not work without modification for other custom models,
+    # but will work for other HuggingFace causal LMs
+    from transformers.models.auto.configuration_auto import CONFIG_MAPPING
+    CONFIG_MAPPING._extra_content['mpt'] = MPTConfig
+    MPTConfig.register_for_auto_class()
+    MPTForCausalLM.register_for_auto_class('AutoModelForCausalLM')
+
+    try:
+        _convert_composer_to_hf(args)
+    except Exception as e:
+        raise e
+    finally:
+        # Undo auto registration after running the script
+        del CONFIG_MAPPING._extra_content['mpt']
+        delattr(MPTConfig, '_auto_class')
+        delattr(MPTForCausalLM, '_auto_class')
+
+
 if __name__ == '__main__':
     convert_composer_to_hf(parse_args())
diff --git a/tests/a_scripts/__init__.py b/tests/a_scripts/__init__.py
new file mode 100644
index 0000000000..eb5c1d149e
--- /dev/null
+++ b/tests/a_scripts/__init__.py
@@ -0,0 +1,6 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
+# TODO: This test directory is called "a_scripts" to enforce that these tests are run
+# first. More cleanup should be done to ensure tests can be run in any order and
+# don't leave artifacts behind.
diff --git a/tests/a_scripts/data_prep/__init__.py b/tests/a_scripts/data_prep/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/a_scripts/data_prep/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/a_scripts/data_prep/test_convert_dataset_hf.py b/tests/a_scripts/data_prep/test_convert_dataset_hf.py
new file mode 100644
index 0000000000..f226b0a4be
--- /dev/null
+++ b/tests/a_scripts/data_prep/test_convert_dataset_hf.py
@@ -0,0 +1,28 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+from argparse import Namespace
+from pathlib import Path
+
+from scripts.data_prep.convert_dataset_hf import main as main_hf
+
+
+def test_download_script_from_api(tmp_path: Path):
+    # test calling it directly
+    path = os.path.join(tmp_path, 'my-copy-c4-1')
+    main_hf(
+        Namespace(
+            **{
+                'dataset': 'c4',
+                'data_subset': 'en',
+                'splits': ['val_xsmall'],
+                'out_root': path,
+                'compression': None,
+                'concat_tokens': None,
+                'bos_text': None,
+                'eos_text': None,
+                'no_wrap': False,
+                'num_workers': None
+            }))
+    assert os.path.exists(path)
diff --git a/tests/a_scripts/data_prep/test_convert_dataset_json.py b/tests/a_scripts/data_prep/test_convert_dataset_json.py
new file mode 100644
index 0000000000..179b8a701b
--- /dev/null
+++ b/tests/a_scripts/data_prep/test_convert_dataset_json.py
@@ -0,0 +1,27 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+from argparse import Namespace
+from pathlib import Path
+
+from scripts.data_prep.convert_dataset_json import main as main_json
+
+
+def test_json_script_from_api(tmp_path: Path):
+    # test calling it directly
+    path = os.path.join(tmp_path, 'my-copy-arxiv-1')
+    main_json(
+        Namespace(
+            **{
+                'path': 'scripts/data_prep/example_data/arxiv.jsonl',
+                'out_root': path,
+                'compression': None,
+                'split': 'train',
+                'concat_tokens': None,
+                'bos_text': None,
+                'eos_text': None,
+                'no_wrap': False,
+                'num_workers': None
+            }))
+    assert os.path.exists(path)
diff --git a/tests/test_convert_text_to_mds.py b/tests/a_scripts/data_prep/test_convert_text_to_mds.py
similarity index 98%
rename from tests/test_convert_text_to_mds.py
rename to tests/a_scripts/data_prep/test_convert_text_to_mds.py
index ab8c25bc2d..cc293a2cdd 100644
--- a/tests/test_convert_text_to_mds.py
+++ b/tests/a_scripts/data_prep/test_convert_text_to_mds.py
@@ -2,13 +2,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import os
-import sys
-
-import pytest
-
-# Add repo root to path so we can import scripts and test it
-repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
-sys.path.append(repo_dir)
 import pathlib
 from concurrent.futures import ProcessPoolExecutor
 from glob import glob
@@ -16,6 +9,7 @@
 from unittest.mock import Mock, patch
 
 import numpy as np
+import pytest
 from streaming import StreamingDataset
 from transformers import AutoTokenizer
 
diff --git a/tests/a_scripts/eval/__init__.py b/tests/a_scripts/eval/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/a_scripts/eval/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/test_eval.py b/tests/a_scripts/eval/test_eval.py
similarity index 89%
rename from tests/test_eval.py
rename to tests/a_scripts/eval/test_eval.py
index 2fc96bb7ad..e8d86903dc 100644
--- a/tests/test_eval.py
+++ b/tests/a_scripts/eval/test_eval.py
@@ -4,8 +4,7 @@
 import copy
 import os
 import pathlib
-import sys
-from typing import Any
+from typing import Any, Union
 
 import omegaconf as om
 import pytest
@@ -14,15 +13,10 @@
 
 from llmfoundry import COMPOSER_MODEL_REGISTRY
 from llmfoundry.utils import build_tokenizer
+from scripts.eval.eval import main  # noqa: E402
 from tests.data_utils import (create_arxiv_dataset, create_c4_dataset_xxsmall,
                               gpt_tiny_cfg)
 
-# Add repo root to path so we can import scripts and test it
-repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
-sys.path.append(repo_dir)
-
-from scripts.eval.eval import main  # noqa: E402
-
 
 @pytest.fixture(autouse=True)
 def set_correct_cwd():
@@ -35,11 +29,16 @@ def set_correct_cwd():
         os.chdir('..')
 
 
-@pytest.fixture()
-def mock_saved_model_path():
-    # load the eval and model config
-    with open('eval/yamls/test_eval.yaml', 'r', encoding='utf-8') as f:
+@pytest.fixture
+def eval_cfg(foundry_dir: str) -> Union[om.ListConfig, om.DictConfig]:
+    yaml_path = os.path.join(foundry_dir, 'scripts/eval/yamls/test_eval.yaml')
+    with open(yaml_path, 'r', encoding='utf-8') as f:
         eval_cfg = om.OmegaConf.load(f)
+    return eval_cfg
+
+
+@pytest.fixture()
+def mock_saved_model_path(eval_cfg: Union[om.ListConfig, om.DictConfig]):
     model_cfg = eval_cfg.models[0]
     # set device to cpu
     device = 'cpu'
@@ -60,12 +59,11 @@ def mock_saved_model_path():
     os.remove(saved_model_path)
 
 
-def test_icl_eval(capfd: Any, mock_saved_model_path: Any):
-    with open('eval/yamls/test_eval.yaml', 'r', encoding='utf-8') as f:
-        test_cfg = om.OmegaConf.load(f)
-    test_cfg.models[0].load_path = mock_saved_model_path
-    assert isinstance(test_cfg, om.DictConfig)
-    main(test_cfg)
+def test_icl_eval(eval_cfg: Union[om.ListConfig, om.DictConfig], capfd: Any,
+                  mock_saved_model_path: Any):
+    eval_cfg.models[0].load_path = mock_saved_model_path
+    assert isinstance(eval_cfg, om.DictConfig)
+    main(eval_cfg)
     out, _ = capfd.readouterr()
     expected_results = '| Category                    | Benchmark      | Subtask   |   Accuracy | Number few shot   | Model    |\n|:----------------------------|:---------------|:----------|-----------:|:------------------|:---------|\n| language_understanding_lite | lambada_openai |           |          0 | 0-shot            | tiny_mpt |'
     assert expected_results in out
diff --git a/tests/test_eval_inputs.py b/tests/a_scripts/eval/test_eval_inputs.py
similarity index 86%
rename from tests/test_eval_inputs.py
rename to tests/a_scripts/eval/test_eval_inputs.py
index 83104b62b7..8694546c4f 100644
--- a/tests/test_eval_inputs.py
+++ b/tests/a_scripts/eval/test_eval_inputs.py
@@ -2,7 +2,6 @@
 # SPDX-License-Identifier: Apache-2.0
 import copy
 import os
-import sys
 import warnings
 
 import omegaconf
@@ -10,10 +9,6 @@
 from omegaconf import DictConfig
 from omegaconf import OmegaConf as om
 
-# Add repo root to path so we can import scripts and test it
-repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
-sys.path.append(repo_dir)
-
 from scripts.eval.eval import main  # noqa: E402
 
 
@@ -21,10 +16,12 @@ class TestHuggingFaceEvalYAMLInputs:
     """Validate and tests error handling for the input YAML file."""
 
     @pytest.fixture
-    def cfg(self) -> DictConfig:
+    def cfg(self, foundry_dir: str) -> DictConfig:
         """Create YAML cfg fixture for testing purposes."""
-        conf_path: str = os.path.join(repo_dir,
-                                      'scripts/eval/yamls/hf_eval.yaml')
+        conf_path: str = os.path.join(
+            foundry_dir,
+            'scripts/eval/yamls/hf_eval.yaml',
+        )
         with open(conf_path, 'r', encoding='utf-8') as config:
             test_cfg = om.load(config)
         assert isinstance(test_cfg, DictConfig)
@@ -78,15 +75,17 @@ def test_optional_mispelled_params_raise_warning(self,
 class TestMPTEvalYAMLInputs:
 
     @pytest.fixture
-    def cfg(self) -> DictConfig:
+    def cfg(self, foundry_dir: str) -> DictConfig:
         """Create YAML cfg fixture for testing purposes."""
-        conf_path: str = os.path.join(repo_dir,
-                                      'scripts/eval/yamls/mpt_eval.yaml')
+        conf_path: str = os.path.join(
+            foundry_dir,
+            'scripts/eval/yamls/mpt_eval.yaml',
+        )
         with open(conf_path, 'r', encoding='utf-8') as config:
             test_cfg = om.load(config)
 
         test_cfg.icl_tasks[0].dataset_uri = os.path.join(
-            repo_dir, 'scripts', test_cfg.icl_tasks[0].dataset_uri)
+            foundry_dir, 'scripts', test_cfg.icl_tasks[0].dataset_uri)
 
         # make tests use cpu initialized transformer models only
         test_cfg.models[0].model.init_device = 'cpu'
diff --git a/tests/a_scripts/inference/__init__.py b/tests/a_scripts/inference/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/a_scripts/inference/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/test_hf_conversion_script.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py
similarity index 99%
rename from tests/test_hf_conversion_script.py
rename to tests/a_scripts/inference/test_convert_composer_to_hf.py
index f9191cd701..d21c942dee 100644
--- a/tests/test_hf_conversion_script.py
+++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py
@@ -4,34 +4,26 @@
 import math
 import os
 import pathlib
-import sys
-from typing import Callable
-from unittest.mock import ANY, MagicMock, patch
-
-from composer import Trainer
-from composer.loggers import MLFlowLogger
-from composer.utils import dist, get_device, using_torch_2
-
-from llmfoundry.callbacks import HuggingFaceCheckpointer
-from llmfoundry.models.mpt.modeling_mpt import ComposerMPTCausalLM
-
-# Add repo root to path so we can import scripts and test it
-repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
-sys.path.append(repo_dir)
 import shutil
 from argparse import Namespace
-from typing import Optional, cast
+from typing import Callable, Optional, cast
+from unittest.mock import ANY, MagicMock, patch
 
 import pytest
 import torch
 import transformers
+from composer import Trainer
+from composer.loggers import MLFlowLogger
+from composer.utils import dist, get_device, using_torch_2
 from omegaconf import DictConfig
 from omegaconf import OmegaConf as om
 from torch.utils.data import DataLoader
 from transformers import PreTrainedModel, PreTrainedTokenizerBase
 
 from llmfoundry import COMPOSER_MODEL_REGISTRY
+from llmfoundry.callbacks import HuggingFaceCheckpointer
 from llmfoundry.data.finetuning import build_finetuning_dataloader
+from llmfoundry.models.mpt.modeling_mpt import ComposerMPTCausalLM
 from llmfoundry.utils.builders import build_optimizer, build_tokenizer
 from scripts.inference.convert_composer_to_hf import convert_composer_to_hf
 from tests.data_utils import make_tiny_ft_dataset
diff --git a/tests/a_scripts/train/__init__.py b/tests/a_scripts/train/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/a_scripts/train/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/test_training.py b/tests/a_scripts/train/test_train.py
similarity index 90%
rename from tests/test_training.py
rename to tests/a_scripts/train/test_train.py
index 3cd2963100..62075383cc 100644
--- a/tests/test_training.py
+++ b/tests/a_scripts/train/test_train.py
@@ -1,9 +1,8 @@
 # Copyright 2022 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0
 import copy
-import os
 import pathlib
-from typing import Any, Optional
+from typing import Optional
 
 import pytest
 from composer.loggers import InMemoryLogger
@@ -16,22 +15,10 @@
                               gpt_tiny_cfg)
 
 
-@pytest.fixture(autouse=False)
-def set_correct_cwd():
-    if not os.getcwd().endswith('llm-foundry/scripts'):
-        os.chdir('scripts')
-
-    yield
-
-    if os.getcwd().endswith('llm-foundry/scripts'):
-        os.chdir('..')
-
-
 @pytest.mark.parametrize('averages', [{
     'core_average': ['language_understanding_lite']
 }, None])
-def test_train_gauntlet(averages: Optional[dict], set_correct_cwd: Any,
-                        tmp_path: pathlib.Path):
+def test_train_gauntlet(averages: Optional[dict], tmp_path: pathlib.Path):
     """Test training run with a small dataset."""
     dataset_name = create_c4_dataset_xxsmall(tmp_path)
     test_cfg = gpt_tiny_cfg(dataset_name, 'cpu')
@@ -40,7 +27,7 @@ def test_train_gauntlet(averages: Optional[dict], set_correct_cwd: Any,
             'label':
                 'lambada_openai',
             'dataset_uri':
-                'eval/local_data/language_understanding/lambada_openai_small.jsonl',
+                'scripts/eval/local_data/language_understanding/lambada_openai_small.jsonl',
             'num_fewshot': [0],
             'icl_task_type':
                 'language_modeling'
@@ -110,7 +97,7 @@ def test_train_gauntlet(averages: Optional[dict], set_correct_cwd: Any,
         -1][-1] == 0
 
 
-def test_train_multi_eval(set_correct_cwd: Any, tmp_path: pathlib.Path):
+def test_train_multi_eval(tmp_path: pathlib.Path):
     """Test training run with multiple eval datasets."""
     c4_dataset_name = create_c4_dataset_xxsmall(tmp_path)
     test_cfg = gpt_tiny_cfg(c4_dataset_name, 'cpu')
diff --git a/tests/test_train_inputs.py b/tests/a_scripts/train/test_train_inputs.py
similarity index 96%
rename from tests/test_train_inputs.py
rename to tests/a_scripts/train/test_train_inputs.py
index 2ed1c9c239..17eca26587 100644
--- a/tests/test_train_inputs.py
+++ b/tests/a_scripts/train/test_train_inputs.py
@@ -3,7 +3,6 @@
 import copy
 import json
 import os
-import sys
 import warnings
 
 import omegaconf
@@ -11,10 +10,6 @@
 from omegaconf import DictConfig
 from omegaconf import OmegaConf as om
 
-# Add repo root to path so we can import scripts and test it
-repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
-sys.path.append(repo_dir)
-
 from scripts.train.train import main  # noqa: E402
 
 
@@ -54,10 +49,10 @@ class TestTrainingYAMLInputs:
     """Validate and tests error handling for the input YAML file."""
 
     @pytest.fixture
-    def cfg(self) -> DictConfig:
+    def cfg(self, foundry_dir: str) -> DictConfig:
         """Create YAML cfg fixture for testing purposes."""
         conf_path: str = os.path.join(
-            repo_dir, 'scripts/train/yamls/pretrain/testing.yaml')
+            foundry_dir, 'scripts/train/yamls/pretrain/testing.yaml')
         with open(conf_path, 'r', encoding='utf-8') as config:
             test_cfg = om.load(config)
         assert isinstance(test_cfg, DictConfig)
diff --git a/tests/callbacks/__init__.py b/tests/callbacks/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/callbacks/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/test_eval_gauntlet.py b/tests/callbacks/test_eval_gauntlet_callback.py
similarity index 100%
rename from tests/test_eval_gauntlet.py
rename to tests/callbacks/test_eval_gauntlet_callback.py
diff --git a/tests/data/__init__.py b/tests/data/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/data/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/test_dataloader.py b/tests/data/test_dataloader.py
similarity index 99%
rename from tests/test_dataloader.py
rename to tests/data/test_dataloader.py
index 2e9039644b..0f5f506e22 100644
--- a/tests/test_dataloader.py
+++ b/tests/data/test_dataloader.py
@@ -5,7 +5,6 @@
 import pathlib
 import random
 import shutil
-import sys
 import tempfile
 from argparse import Namespace
 from typing import Literal, Optional, Union
@@ -26,10 +25,6 @@
                                        build_text_dataloader,
                                        get_tokens_per_batch_func)
 from llmfoundry.utils.builders import build_tokenizer
-
-# Add repo root to path so we can import scripts and test it
-repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
-sys.path.append(repo_dir)
 from scripts.data_prep.convert_dataset_hf import main as main_hf
 from tests.data_utils import make_tiny_ft_dataset
 
diff --git a/tests/test_icl_datasets.py b/tests/data/test_icl_datasets.py
similarity index 98%
rename from tests/test_icl_datasets.py
rename to tests/data/test_icl_datasets.py
index 28d12df91d..3a730fdf19 100644
--- a/tests/test_icl_datasets.py
+++ b/tests/data/test_icl_datasets.py
@@ -10,7 +10,7 @@
 from llmfoundry.utils.builders import build_icl_evaluators
 
 
-def load_icl_config(conf_path: str = 'tests/test_tasks.yaml'):
+def load_icl_config(conf_path: str = 'tests/data/test_tasks.yaml'):
     with open(conf_path) as f:
         test_cfg = om.load(f)
     return test_cfg
diff --git a/tests/test_packing.py b/tests/data/test_packing.py
similarity index 100%
rename from tests/test_packing.py
rename to tests/data/test_packing.py
diff --git a/tests/test_tasks.yaml b/tests/data/test_tasks.yaml
similarity index 100%
rename from tests/test_tasks.yaml
rename to tests/data/test_tasks.yaml
diff --git a/tests/data_utils.py b/tests/data_utils.py
index efb4f6d7cf..a0ad6bcd13 100644
--- a/tests/data_utils.py
+++ b/tests/data_utils.py
@@ -1,14 +1,8 @@
 # Copyright 2022 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0
 
-import os
-import sys
-
-# Add repo root to path so we can import scripts and test it
-repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
-sys.path.append(repo_dir)
-
 import json
+import os
 import pathlib
 import shutil
 from argparse import Namespace
@@ -120,10 +114,14 @@ def create_arxiv_dataset(path: pathlib.Path) -> str:
     arxiv_dir = os.path.join(path, f'my-copy-arxiv')
     downloaded_split = 'train'
 
+    arxiv_path = 'data_prep/example_data/arxiv.jsonl'
+    if not os.getcwd().endswith('scripts'):
+        arxiv_path = os.path.join('scripts', arxiv_path)
+
     main_json(
         Namespace(
             **{
-                'path': 'data_prep/example_data/arxiv.jsonl',
+                'path': arxiv_path,
                 'out_root': arxiv_dir,
                 'compression': None,
                 'split': downloaded_split,
@@ -139,8 +137,11 @@ def create_arxiv_dataset(path: pathlib.Path) -> str:
 
 def gpt_tiny_cfg(dataset_name: str, device: str):
     """Create gpt tiny cfg."""
-    conf_path: str = os.path.join(repo_dir,
-                                  'scripts/train/yamls/pretrain/testing.yaml')
+    from tests.fixtures.autouse import REPO_DIR
+    conf_path: str = os.path.join(
+        REPO_DIR,
+        'scripts/train/yamls/pretrain/testing.yaml',
+    )
     with open(conf_path) as f:
         test_cfg = om.load(f)
     assert isinstance(test_cfg, DictConfig)
diff --git a/tests/fixtures/autouse.py b/tests/fixtures/autouse.py
index c51ccfacb0..75caa6c941 100644
--- a/tests/fixtures/autouse.py
+++ b/tests/fixtures/autouse.py
@@ -2,11 +2,17 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import gc
+import os
+import sys
 
 import pytest
 import torch
 from composer.utils import dist, get_device, reproducibility
 
+# Add llm-foundry repo root to path so we can import scripts in the tests
+REPO_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))
+sys.path.append(REPO_DIR)
+
 
 @pytest.fixture(autouse=True)
 def initialize_dist(request: pytest.FixtureRequest):
@@ -33,6 +39,11 @@ def random_seed() -> int:
     return 17
 
 
+@pytest.fixture
+def foundry_dir() -> str:
+    return REPO_DIR
+
+
 @pytest.fixture(autouse=True)
 def seed_all(random_seed: int):
     """Sets the seed for reproducibility."""
diff --git a/tests/models/__init__.py b/tests/models/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/models/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/models/hf/__init__.py b/tests/models/hf/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/models/hf/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/test_hf_config.py b/tests/models/hf/test_hf_config.py
similarity index 100%
rename from tests/test_hf_config.py
rename to tests/models/hf/test_hf_config.py
diff --git a/tests/test_hf_mpt_gen.py b/tests/models/hf/test_hf_mpt_gen.py
similarity index 100%
rename from tests/test_hf_mpt_gen.py
rename to tests/models/hf/test_hf_mpt_gen.py
diff --git a/tests/test_hf_v_mpt.py b/tests/models/hf/test_hf_v_mpt.py
similarity index 100%
rename from tests/test_hf_v_mpt.py
rename to tests/models/hf/test_hf_v_mpt.py
diff --git a/tests/models/inference_api_wrapper/__init__.py b/tests/models/inference_api_wrapper/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/models/inference_api_wrapper/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/test_inference_api_eval_wrapper.py b/tests/models/inference_api_wrapper/test_inference_api_eval_wrapper.py
similarity index 100%
rename from tests/test_inference_api_eval_wrapper.py
rename to tests/models/inference_api_wrapper/test_inference_api_eval_wrapper.py
diff --git a/tests/models/layers/__init__.py b/tests/models/layers/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/models/layers/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/test_flash_triton_torch.py b/tests/models/layers/test_flash_triton_torch.py
similarity index 100%
rename from tests/test_flash_triton_torch.py
rename to tests/models/layers/test_flash_triton_torch.py
diff --git a/tests/test_huggingface_flash.py b/tests/models/layers/test_huggingface_flash.py
similarity index 100%
rename from tests/test_huggingface_flash.py
rename to tests/models/layers/test_huggingface_flash.py
diff --git a/tests/test_fsdp_act_checkpoint.py b/tests/models/test_fsdp_act_checkpoint.py
similarity index 100%
rename from tests/test_fsdp_act_checkpoint.py
rename to tests/models/test_fsdp_act_checkpoint.py
diff --git a/tests/test_model.py b/tests/models/test_model.py
similarity index 100%
rename from tests/test_model.py
rename to tests/models/test_model.py
diff --git a/tests/test_mpt_gen.py b/tests/models/test_mpt_gen.py
similarity index 100%
rename from tests/test_mpt_gen.py
rename to tests/models/test_mpt_gen.py
diff --git a/tests/test_onnx.py b/tests/models/test_onnx.py
similarity index 100%
rename from tests/test_onnx.py
rename to tests/models/test_onnx.py
diff --git a/tests/test_rope_dail_vs_hf.py b/tests/models/test_rope_dail_vs_hf.py
similarity index 100%
rename from tests/test_rope_dail_vs_hf.py
rename to tests/models/test_rope_dail_vs_hf.py
diff --git a/tests/models/utils/__init__.py b/tests/models/utils/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/models/utils/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/test_init_fn.py b/tests/models/utils/test_param_init_fns.py
similarity index 100%
rename from tests/test_init_fn.py
rename to tests/models/utils/test_param_init_fns.py
diff --git a/tests/optim/__init__.py b/tests/optim/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/optim/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/test_lion8b.py b/tests/optim/test_lion8b.py
similarity index 100%
rename from tests/test_lion8b.py
rename to tests/optim/test_lion8b.py
diff --git a/tests/test_scheduler.py b/tests/optim/test_scheduler.py
similarity index 100%
rename from tests/test_scheduler.py
rename to tests/optim/test_scheduler.py
diff --git a/tests/test_data_prep_scripts.py b/tests/test_data_prep_scripts.py
deleted file mode 100644
index 4fe5ed7e64..0000000000
--- a/tests/test_data_prep_scripts.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Copyright 2022 MosaicML LLM Foundry authors
-# SPDX-License-Identifier: Apache-2.0
-
-import os
-import sys
-from argparse import Namespace
-from pathlib import Path
-
-# Add repo root to path so we can import scripts and test it
-repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
-sys.path.append(repo_dir)
-from scripts.data_prep.convert_dataset_hf import main as main_hf
-from scripts.data_prep.convert_dataset_json import main as main_json
-
-
-def test_download_script_from_api(tmp_path: Path):
-    # test calling it directly
-    path = os.path.join(tmp_path, 'my-copy-c4-1')
-    main_hf(
-        Namespace(
-            **{
-                'dataset': 'c4',
-                'data_subset': 'en',
-                'splits': ['val_xsmall'],
-                'out_root': path,
-                'compression': None,
-                'concat_tokens': None,
-                'bos_text': None,
-                'eos_text': None,
-                'no_wrap': False,
-                'num_workers': None
-            }))
-    assert os.path.exists(path)
-
-
-def test_json_script_from_api(tmp_path: Path):
-    # test calling it directly
-    path = os.path.join(tmp_path, 'my-copy-arxiv-1')
-    main_json(
-        Namespace(
-            **{
-                'path': 'scripts/data_prep/example_data/arxiv.jsonl',
-                'out_root': path,
-                'compression': None,
-                'split': 'train',
-                'concat_tokens': None,
-                'bos_text': None,
-                'eos_text': None,
-                'no_wrap': False,
-                'num_workers': None
-            }))
-    assert os.path.exists(path)
diff --git a/tests/tokenizers/__init__.py b/tests/tokenizers/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/tokenizers/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/test_tiktoken.py b/tests/tokenizers/test_tiktoken.py
similarity index 99%
rename from tests/test_tiktoken.py
rename to tests/tokenizers/test_tiktoken.py
index fe3db41d50..60907092c8 100644
--- a/tests/test_tiktoken.py
+++ b/tests/tokenizers/test_tiktoken.py
@@ -9,8 +9,9 @@
 
 from llmfoundry.tokenizers.tiktoken import (TiktokenTokenizerWrapper,
                                             bytes_to_unicode)
+from tests.a_scripts.inference.test_convert_composer_to_hf import \
+    check_hf_tokenizer_equivalence
 from tests.horrible_strings import HORRIBLE_STRINGS
-from tests.test_hf_conversion_script import check_hf_tokenizer_equivalence
 
 if TYPE_CHECKING:
     from tiktoken.core import Encoding
diff --git a/tests/test_tokenizer.py b/tests/tokenizers/test_tokenizer.py
similarity index 100%
rename from tests/test_tokenizer.py
rename to tests/tokenizers/test_tokenizer.py
diff --git a/tests/utils/__init__.py b/tests/utils/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/utils/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/test_builders.py b/tests/utils/test_builders.py
similarity index 100%
rename from tests/test_builders.py
rename to tests/utils/test_builders.py
diff --git a/tests/test_model_download_utils.py b/tests/utils/test_model_download_utils.py
similarity index 100%
rename from tests/test_model_download_utils.py
rename to tests/utils/test_model_download_utils.py
diff --git a/tests/test_prompt_files.py b/tests/utils/test_prompt_files.py
similarity index 100%
rename from tests/test_prompt_files.py
rename to tests/utils/test_prompt_files.py