diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py
index a29dee7683..4b80ffef54 100644
--- a/llmfoundry/data/finetuning/tasks.py
+++ b/llmfoundry/data/finetuning/tasks.py
@@ -47,25 +47,46 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:
 
 __all__ = ['dataset_constructor']
 
+_ALLOWED_RESPONSE_KEYS = {'response', 'completion'}
+_ALLOWED_PROMPT_KEYS = {'prompt'}
+
 
 def _tokenize_formatted_example(
         example: Dict[str, Any],
         tokenizer: PreTrainedTokenizerBase) -> Dict[str, List[int]]:
-    if ('prompt' not in example) or ('response' not in example):
+    """Tokenize a formatted example and validate expected keys."""
+    example_keys = set(example.keys())
+    prompt_keys = example_keys.intersection(_ALLOWED_PROMPT_KEYS)
+    response_keys = example_keys.intersection(_ALLOWED_RESPONSE_KEYS)
+
+    if len(prompt_keys) != 1:
+        raise KeyError(
+            f'Unable to tokenize example because {len(prompt_keys)} of the allowed prompt keys ' +\
+            f'were present in {example_keys=}. Please specify exactly one. {_ALLOWED_PROMPT_KEYS=}'
+        )
+
+    if len(response_keys) != 1:
         raise KeyError(
-            'Unable to tokenize example because it has not been properly formatted. ' +\
-            '"prompt" and "response" are required keys but at least one was missing ' +\
-            f'from {example=}.'
+            f'Unable to tokenize example because {len(response_keys)} of the allowed response keys ' +\
+            f'were present in {example_keys=}. Please specify exactly one. {_ALLOWED_RESPONSE_KEYS=}'
         )
-    if not isinstance(example['prompt'], str):
+
+    prompt_key = prompt_keys.pop()
+    response_key = response_keys.pop()
+    prompt = example[prompt_key]
+    response = example[response_key]
+
+    if not isinstance(prompt, str):
         raise TypeError(
-            f'Unable to tokenize example because "prompt" was not a string. {example=}'
+            f'Unable to tokenize example because {prompt_key} was not a string. {example=}'
         )
-    if not isinstance(example['response'], str):
+
+    if not isinstance(response, str):
         raise TypeError(
-            f'Unable to tokenize example because "response" was not a string. {example=}'
+            f'Unable to tokenize example because {response_key} was not a string. {example=}'
         )
-    return tokenizer(text=example['prompt'], text_target=example['response'])
+
+    return tokenizer(text=prompt, text_target=response)
 
 
 class StreamingFinetuningDataset(StreamingDataset):
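Note: with this change, _tokenize_formatted_example accepts exactly one prompt key ('prompt') and exactly one response key ('response' or 'completion'). A minimal sketch of the new behavior, assuming a local checkout with transformers installed (not part of the diff):

    from transformers import AutoTokenizer

    from llmfoundry.data.finetuning.tasks import _tokenize_formatted_example

    tokenizer = AutoTokenizer.from_pretrained('gpt2')

    # 'completion' is now accepted as an alias for 'response'
    batch = _tokenize_formatted_example(
        {'prompt': 'Q: What is 2+2? A:', 'completion': '4'}, tokenizer)
    assert 'input_ids' in batch and 'labels' in batch
    # supplying both 'response' and 'completion' is ambiguous and raises KeyError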
diff --git a/scripts/inference/convert_composer_to_hf.py b/scripts/inference/convert_composer_to_hf.py
index 1b43762473..51afb105c8 100644
--- a/scripts/inference/convert_composer_to_hf.py
+++ b/scripts/inference/convert_composer_to_hf.py
@@ -168,19 +168,11 @@ def parse_args() -> Namespace:
     return parser.parse_args()
 
 
-def convert_composer_to_hf(args: Namespace) -> None:
+def _convert_composer_to_hf(args: Namespace) -> None:
     print()
     print('#' * 30)
     print('Converting Composer checkpoint to HuggingFace checkpoint format...')
 
-    # Register MPT auto classes so that this script works with MPT
-    # This script will not work without modification for other custom models,
-    # but will work for other HuggingFace causal LMs
-    from transformers.models.auto.configuration_auto import CONFIG_MAPPING
-    CONFIG_MAPPING._extra_content['mpt'] = MPTConfig
-    MPTConfig.register_for_auto_class()
-    MPTForCausalLM.register_for_auto_class('AutoModelForCausalLM')
-
     _, _, local_folder_path = parse_uri(args.hf_output_path)
 
     config, tokenizer = write_huggingface_pretrained_from_composer_checkpoint(
@@ -296,5 +288,25 @@ def convert_composer_to_hf(args: Namespace) -> None:
     )
 
 
+def convert_composer_to_hf(args: Namespace) -> None:
+    # Register MPT auto classes so that this script works with MPT
+    # This script will not work without modification for other custom models,
+    # but will work for other HuggingFace causal LMs
+    from transformers.models.auto.configuration_auto import CONFIG_MAPPING
+    CONFIG_MAPPING._extra_content['mpt'] = MPTConfig
+    MPTConfig.register_for_auto_class()
+    MPTForCausalLM.register_for_auto_class('AutoModelForCausalLM')
+
+    try:
+        _convert_composer_to_hf(args)
+    except Exception as e:
+        raise e
+    finally:
+        # Undo auto registration after running the script
+        del CONFIG_MAPPING._extra_content['mpt']
+        delattr(MPTConfig, '_auto_class')
+        delattr(MPTForCausalLM, '_auto_class')
+
+
 if __name__ == '__main__':
     convert_composer_to_hf(parse_args())
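Note: the register/unregister pairing above could equivalently be packaged as a context manager; a hypothetical sketch of that alternative (illustrative only, not part of this PR):

    from contextlib import contextmanager

    from transformers.models.auto.configuration_auto import CONFIG_MAPPING

    from llmfoundry import MPTConfig, MPTForCausalLM


    @contextmanager
    def mpt_auto_classes_registered():
        # Register MPT auto classes for the duration of the block
        CONFIG_MAPPING._extra_content['mpt'] = MPTConfig
        MPTConfig.register_for_auto_class()
        MPTForCausalLM.register_for_auto_class('AutoModelForCausalLM')
        try:
            yield
        finally:
            # Undo the registration even if conversion fails
            del CONFIG_MAPPING._extra_content['mpt']
            delattr(MPTConfig, '_auto_class')
            delattr(MPTForCausalLM, '_auto_class')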
diff --git a/tests/a_scripts/__init__.py b/tests/a_scripts/__init__.py
new file mode 100644
index 0000000000..eb5c1d149e
--- /dev/null
+++ b/tests/a_scripts/__init__.py
@@ -0,0 +1,6 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
+# TODO: This test directory is called "a_scripts" to enforce that these tests are run
+# first. More clean up should be done to ensure tests can be run in any order and
+# don't leave around artifacts
diff --git a/tests/a_scripts/data_prep/__init__.py b/tests/a_scripts/data_prep/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/a_scripts/data_prep/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/a_scripts/data_prep/test_convert_dataset_hf.py b/tests/a_scripts/data_prep/test_convert_dataset_hf.py
new file mode 100644
index 0000000000..f226b0a4be
--- /dev/null
+++ b/tests/a_scripts/data_prep/test_convert_dataset_hf.py
@@ -0,0 +1,28 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+from argparse import Namespace
+from pathlib import Path
+
+from scripts.data_prep.convert_dataset_hf import main as main_hf
+
+
+def test_download_script_from_api(tmp_path: Path):
+    # test calling it directly
+    path = os.path.join(tmp_path, 'my-copy-c4-1')
+    main_hf(
+        Namespace(
+            **{
+                'dataset': 'c4',
+                'data_subset': 'en',
+                'splits': ['val_xsmall'],
+                'out_root': path,
+                'compression': None,
+                'concat_tokens': None,
+                'bos_text': None,
+                'eos_text': None,
+                'no_wrap': False,
+                'num_workers': None
+            }))
+    assert os.path.exists(path)
diff --git a/tests/a_scripts/data_prep/test_convert_dataset_json.py b/tests/a_scripts/data_prep/test_convert_dataset_json.py
new file mode 100644
index 0000000000..179b8a701b
--- /dev/null
+++ b/tests/a_scripts/data_prep/test_convert_dataset_json.py
@@ -0,0 +1,27 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+from argparse import Namespace
+from pathlib import Path
+
+from scripts.data_prep.convert_dataset_json import main as main_json
+
+
+def test_json_script_from_api(tmp_path: Path):
+    # test calling it directly
+    path = os.path.join(tmp_path, 'my-copy-arxiv-1')
+    main_json(
+        Namespace(
+            **{
+                'path': 'scripts/data_prep/example_data/arxiv.jsonl',
+                'out_root': path,
+                'compression': None,
+                'split': 'train',
+                'concat_tokens': None,
+                'bos_text': None,
+                'eos_text': None,
+                'no_wrap': False,
+                'num_workers': None
+            }))
+    assert os.path.exists(path)
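Note: both tests drive the converters through main() with a Namespace that mirrors each script's argparse flags, so no subprocess is needed. The equivalent shell invocation would look roughly like this (flag spellings inferred from the Namespace keys above):

    python scripts/data_prep/convert_dataset_hf.py \
        --dataset c4 --data_subset en \
        --splits val_xsmall --out_root ./my-copy-c4-1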
diff --git a/tests/test_convert_text_to_mds.py b/tests/a_scripts/data_prep/test_convert_text_to_mds.py
similarity index 98%
rename from tests/test_convert_text_to_mds.py
rename to tests/a_scripts/data_prep/test_convert_text_to_mds.py
index ab8c25bc2d..cc293a2cdd 100644
--- a/tests/test_convert_text_to_mds.py
+++ b/tests/a_scripts/data_prep/test_convert_text_to_mds.py
@@ -2,13 +2,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import os
-import sys
-
-import pytest
-
-# Add repo root to path so we can import scripts and test it
-repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
-sys.path.append(repo_dir)
 import pathlib
 from concurrent.futures import ProcessPoolExecutor
 from glob import glob
@@ -16,6 +9,7 @@ from unittest.mock import Mock, patch
 
 import numpy as np
+import pytest
 from streaming import StreamingDataset
 from transformers import AutoTokenizer
 
 
diff --git a/tests/a_scripts/eval/__init__.py b/tests/a_scripts/eval/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/a_scripts/eval/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/test_eval.py b/tests/a_scripts/eval/test_eval.py
similarity index 89%
rename from tests/test_eval.py
rename to tests/a_scripts/eval/test_eval.py
index 2fc96bb7ad..e8d86903dc 100644
--- a/tests/test_eval.py
+++ b/tests/a_scripts/eval/test_eval.py
@@ -4,8 +4,7 @@
 import copy
 import os
 import pathlib
-import sys
-from typing import Any
+from typing import Any, Union
 
 import omegaconf as om
 import pytest
@@ -14,15 +13,10 @@
 
 from llmfoundry import COMPOSER_MODEL_REGISTRY
 from llmfoundry.utils import build_tokenizer
+from scripts.eval.eval import main  # noqa: E402
 from tests.data_utils import (create_arxiv_dataset, create_c4_dataset_xxsmall,
                               gpt_tiny_cfg)
 
-# Add repo root to path so we can import scripts and test it
-repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
-sys.path.append(repo_dir)
-
-from scripts.eval.eval import main  # noqa: E402
-
 
 @pytest.fixture(autouse=True)
 def set_correct_cwd():
@@ -35,11 +29,16 @@ def set_correct_cwd():
         os.chdir('..')
 
 
-@pytest.fixture()
-def mock_saved_model_path():
-    # load the eval and model config
-    with open('eval/yamls/test_eval.yaml', 'r', encoding='utf-8') as f:
+@pytest.fixture
+def eval_cfg(foundry_dir: str) -> Union[om.ListConfig, om.DictConfig]:
+    yaml_path = os.path.join(foundry_dir, 'scripts/eval/yamls/test_eval.yaml')
+    with open(yaml_path, 'r', encoding='utf-8') as f:
         eval_cfg = om.OmegaConf.load(f)
+    return eval_cfg
+
+
+@pytest.fixture()
+def mock_saved_model_path(eval_cfg: Union[om.ListConfig, om.DictConfig]):
     model_cfg = eval_cfg.models[0]
     # set device to cpu
     device = 'cpu'
@@ -60,12 +59,11 @@ def mock_saved_model_path():
     os.remove(saved_model_path)
 
 
-def test_icl_eval(capfd: Any, mock_saved_model_path: Any):
-    with open('eval/yamls/test_eval.yaml', 'r', encoding='utf-8') as f:
-        test_cfg = om.OmegaConf.load(f)
-    test_cfg.models[0].load_path = mock_saved_model_path
-    assert isinstance(test_cfg, om.DictConfig)
-    main(test_cfg)
+def test_icl_eval(eval_cfg: Union[om.ListConfig, om.DictConfig], capfd: Any,
+                  mock_saved_model_path: Any):
+    eval_cfg.models[0].load_path = mock_saved_model_path
+    assert isinstance(eval_cfg, om.DictConfig)
+    main(eval_cfg)
     out, _ = capfd.readouterr()
     expected_results = '| Category                    | Benchmark      | Subtask   |   Accuracy | Number few shot   | Model    |\n|:----------------------------|:---------------|:----------|-----------:|:------------------|:---------|\n| language_understanding_lite | lambada_openai |           |          0 | 0-shot            | tiny_mpt |'
     assert expected_results in out
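Note: the refactor replaces repeated YAML loads with chained fixtures (foundry_dir -> eval_cfg -> mock_saved_model_path -> test_icl_eval), so the config is parsed once and shared. A generic sketch of the pytest pattern, with illustrative names only:

    import pytest


    @pytest.fixture
    def base_cfg() -> dict:
        return {'load_path': None}


    @pytest.fixture
    def checkpoint_path(base_cfg: dict) -> str:
        # pytest resolves base_cfg first, then injects it here
        return '/tmp/model.pt'


    def test_eval(base_cfg: dict, checkpoint_path: str):
        base_cfg['load_path'] = checkpoint_path
        assert base_cfg['load_path'] is not None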
diff --git a/tests/test_eval_inputs.py b/tests/a_scripts/eval/test_eval_inputs.py
similarity index 86%
rename from tests/test_eval_inputs.py
rename to tests/a_scripts/eval/test_eval_inputs.py
index 83104b62b7..8694546c4f 100644
--- a/tests/test_eval_inputs.py
+++ b/tests/a_scripts/eval/test_eval_inputs.py
@@ -2,7 +2,6 @@
 # SPDX-License-Identifier: Apache-2.0
 import copy
 import os
-import sys
 import warnings
 
 import omegaconf
@@ -10,10 +9,6 @@
 from omegaconf import DictConfig
 from omegaconf import OmegaConf as om
 
-# Add repo root to path so we can import scripts and test it
-repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
-sys.path.append(repo_dir)
-
 from scripts.eval.eval import main  # noqa: E402
 
 
@@ -21,10 +16,12 @@ class TestHuggingFaceEvalYAMLInputs:
     """Validate and tests error handling for the input YAML file."""
 
     @pytest.fixture
-    def cfg(self) -> DictConfig:
+    def cfg(self, foundry_dir: str) -> DictConfig:
         """Create YAML cfg fixture for testing purposes."""
-        conf_path: str = os.path.join(repo_dir,
-                                      'scripts/eval/yamls/hf_eval.yaml')
+        conf_path: str = os.path.join(
+            foundry_dir,
+            'scripts/eval/yamls/hf_eval.yaml',
+        )
         with open(conf_path, 'r', encoding='utf-8') as config:
             test_cfg = om.load(config)
         assert isinstance(test_cfg, DictConfig)
@@ -78,15 +75,17 @@ def test_optional_mispelled_params_raise_warning(self,
 class TestMPTEvalYAMLInputs:
 
     @pytest.fixture
-    def cfg(self) -> DictConfig:
+    def cfg(self, foundry_dir: str) -> DictConfig:
         """Create YAML cfg fixture for testing purposes."""
-        conf_path: str = os.path.join(repo_dir,
-                                      'scripts/eval/yamls/mpt_eval.yaml')
+        conf_path: str = os.path.join(
+            foundry_dir,
+            'scripts/eval/yamls/mpt_eval.yaml',
+        )
         with open(conf_path, 'r', encoding='utf-8') as config:
             test_cfg = om.load(config)
 
         test_cfg.icl_tasks[0].dataset_uri = os.path.join(
-            repo_dir, 'scripts', test_cfg.icl_tasks[0].dataset_uri)
+            foundry_dir, 'scripts', test_cfg.icl_tasks[0].dataset_uri)
 
         # make tests use cpu initialized transformer models only
         test_cfg.models[0].model.init_device = 'cpu'
diff --git a/tests/a_scripts/inference/__init__.py b/tests/a_scripts/inference/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/a_scripts/inference/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/test_hf_conversion_script.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py
similarity index 99%
rename from tests/test_hf_conversion_script.py
rename to tests/a_scripts/inference/test_convert_composer_to_hf.py
index f9191cd701..d21c942dee 100644
--- a/tests/test_hf_conversion_script.py
+++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py
@@ -4,34 +4,26 @@
 import math
 import os
 import pathlib
-import sys
-from typing import Callable
-from unittest.mock import ANY, MagicMock, patch
-
-from composer import Trainer
-from composer.loggers import MLFlowLogger
-from composer.utils import dist, get_device, using_torch_2
-
-from llmfoundry.callbacks import HuggingFaceCheckpointer
-from llmfoundry.models.mpt.modeling_mpt import ComposerMPTCausalLM
-
-# Add repo root to path so we can import scripts and test it
-repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
-sys.path.append(repo_dir)
 import shutil
 from argparse import Namespace
-from typing import Optional, cast
+from typing import Callable, Optional, cast
+from unittest.mock import ANY, MagicMock, patch
 
 import pytest
 import torch
 import transformers
+from composer import Trainer
+from composer.loggers import MLFlowLogger
+from composer.utils import dist, get_device, using_torch_2
 from omegaconf import DictConfig
 from omegaconf import OmegaConf as om
 from torch.utils.data import DataLoader
 from transformers import PreTrainedModel, PreTrainedTokenizerBase
 
 from llmfoundry import COMPOSER_MODEL_REGISTRY
+from llmfoundry.callbacks import HuggingFaceCheckpointer
 from llmfoundry.data.finetuning import build_finetuning_dataloader
+from llmfoundry.models.mpt.modeling_mpt import ComposerMPTCausalLM
 from llmfoundry.utils.builders import build_optimizer, build_tokenizer
 from scripts.inference.convert_composer_to_hf import convert_composer_to_hf
 from tests.data_utils import make_tiny_ft_dataset
diff --git a/tests/a_scripts/train/__init__.py b/tests/a_scripts/train/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/a_scripts/train/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/test_training.py b/tests/a_scripts/train/test_train.py
similarity index 90%
rename from tests/test_training.py
rename to tests/a_scripts/train/test_train.py
index 3cd2963100..62075383cc 100644
--- a/tests/test_training.py
+++ b/tests/a_scripts/train/test_train.py
@@ -1,9 +1,8 @@
 # Copyright 2022 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0
 import copy
-import os
 import pathlib
-from typing import Any, Optional
+from typing import Optional
 
 import pytest
 from composer.loggers import InMemoryLogger
@@ -16,22 +15,10 @@
                               gpt_tiny_cfg)
 
 
-@pytest.fixture(autouse=False)
-def set_correct_cwd():
-    if not os.getcwd().endswith('llm-foundry/scripts'):
-        os.chdir('scripts')
-
-    yield
-
-    if os.getcwd().endswith('llm-foundry/scripts'):
-        os.chdir('..')
-
-
 @pytest.mark.parametrize('averages', [{
     'core_average': ['language_understanding_lite']
 }, None])
-def test_train_gauntlet(averages: Optional[dict], set_correct_cwd: Any,
-                        tmp_path: pathlib.Path):
+def test_train_gauntlet(averages: Optional[dict], tmp_path: pathlib.Path):
     """Test training run with a small dataset."""
     dataset_name = create_c4_dataset_xxsmall(tmp_path)
     test_cfg = gpt_tiny_cfg(dataset_name, 'cpu')
@@ -40,7 +27,7 @@ def test_train_gauntlet(averages: Optional[dict], set_correct_cwd: Any,
             'label':
                 'lambada_openai',
             'dataset_uri':
-                'eval/local_data/language_understanding/lambada_openai_small.jsonl',
+                'scripts/eval/local_data/language_understanding/lambada_openai_small.jsonl',
             'num_fewshot': [0],
             'icl_task_type':
                 'language_modeling'
@@ -110,7 +97,7 @@ def test_train_gauntlet(averages: Optional[dict], set_correct_cwd: Any,
                                                        -1][-1] == 0
 
 
-def test_train_multi_eval(set_correct_cwd: Any, tmp_path: pathlib.Path):
+def test_train_multi_eval(tmp_path: pathlib.Path):
     """Test training run with multiple eval datasets."""
     c4_dataset_name = create_c4_dataset_xxsmall(tmp_path)
     test_cfg = gpt_tiny_cfg(c4_dataset_name, 'cpu')
diff --git a/tests/test_train_inputs.py b/tests/a_scripts/train/test_train_inputs.py
similarity index 96%
rename from tests/test_train_inputs.py
rename to tests/a_scripts/train/test_train_inputs.py
index 2ed1c9c239..17eca26587 100644
--- a/tests/test_train_inputs.py
+++ b/tests/a_scripts/train/test_train_inputs.py
@@ -3,7 +2,6 @@
 import copy
 import json
 import os
-import sys
 import warnings
 
 import omegaconf
@@ -11,10 +10,6 @@
 from omegaconf import DictConfig
 from omegaconf import OmegaConf as om
 
-# Add repo root to path so we can import scripts and test it
-repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
-sys.path.append(repo_dir)
-
 from scripts.train.train import main  # noqa: E402
 
 
@@ -54,10 +49,10 @@ class TestTrainingYAMLInputs:
     """Validate and tests error handling for the input YAML file."""
 
     @pytest.fixture
-    def cfg(self) -> DictConfig:
+    def cfg(self, foundry_dir: str) -> DictConfig:
         """Create YAML cfg fixture for testing purposes."""
         conf_path: str = os.path.join(
-            repo_dir, 'scripts/train/yamls/pretrain/testing.yaml')
+            foundry_dir, 'scripts/train/yamls/pretrain/testing.yaml')
         with open(conf_path, 'r', encoding='utf-8') as config:
             test_cfg = om.load(config)
         assert isinstance(test_cfg, DictConfig)
diff --git a/tests/callbacks/__init__.py b/tests/callbacks/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/callbacks/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/test_eval_gauntlet.py b/tests/callbacks/test_eval_gauntlet_callback.py
similarity index 100%
rename from tests/test_eval_gauntlet.py
rename to tests/callbacks/test_eval_gauntlet_callback.py
diff --git a/tests/data/__init__.py b/tests/data/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/data/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/test_dataloader.py b/tests/data/test_dataloader.py
similarity index 94%
rename from tests/test_dataloader.py
rename to tests/data/test_dataloader.py
index 2e9039644b..747021e82a 100644
--- a/tests/test_dataloader.py
+++ b/tests/data/test_dataloader.py
@@ -5,7 +5,6 @@
 import pathlib
 import random
 import shutil
-import sys
 import tempfile
 from argparse import Namespace
 from typing import Literal, Optional, Union
@@ -22,14 +21,13 @@
 from llmfoundry import (build_finetuning_dataloader,
                         build_text_denoising_dataloader)
 from llmfoundry.data import build_dataloader
+from llmfoundry.data.finetuning.tasks import (_ALLOWED_PROMPT_KEYS,
+                                              _ALLOWED_RESPONSE_KEYS,
+                                              _tokenize_formatted_example)
 from llmfoundry.data.text_data import (ConcatenatedSequenceCollatorWrapper,
                                        build_text_dataloader,
                                        get_tokens_per_batch_func)
 from llmfoundry.utils.builders import build_tokenizer
-
-# Add repo root to path so we can import scripts and test it
-repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
-sys.path.append(repo_dir)
 from scripts.data_prep.convert_dataset_hf import main as main_hf
 from tests.data_utils import make_tiny_ft_dataset
 
@@ -360,10 +358,8 @@ def test_finetuning_dataloader_small_data(dataset_size: int,
     if (dist.get_world_size() * device_batch_size > dataset_size) and drop_last:
         error_context = pytest.raises(ValueError, match='Your dataset')
     if invalid_dataset:
-        error_context = pytest.raises(
-            TypeError,
-            match='Unable to tokenize example because "prompt" was not a string'
-        )
+        error_context = pytest.raises(TypeError,
+                                      match='Unable to tokenize example')
 
     with error_context:
         _ = build_finetuning_dataloader(cfg, tokenizer, device_batch_size)
@@ -372,6 +368,39 @@ def test_finetuning_dataloader_small_data(dataset_size: int,
         shutil.rmtree(tiny_dataset_folder_path)
 
 
+def test_tokenize_example_malformed():
+    no_keys = {}
+    no_prompt_key = {'response': 'response'}
+    no_response_key = {'prompt': 'prompt'}
+    extra_keys_with_prompt = {'prompt': 'prompt', 'extra': 'extra'}
+    extra_keys_with_response = {'response': 'response', 'extra': 'extra'}
+    multiple_allowed_response_keys = {
+        'prompt': 'prompt',
+        'response': 'response',
+        'completion': 'completion'
+    }
+
+    malformed_examples = [
+        no_keys, no_prompt_key, no_response_key, extra_keys_with_prompt,
+        extra_keys_with_response, multiple_allowed_response_keys
+    ]
+
+    for example in malformed_examples:
+        with pytest.raises(KeyError):
+            _tokenize_formatted_example(example, MagicMock())
+
+
+def test_tokenize_example_well_formed():
+    tokenizer = transformers.AutoTokenizer.from_pretrained('gpt2')
+
+    for prompt_key in _ALLOWED_PROMPT_KEYS:
+        for response_key in _ALLOWED_RESPONSE_KEYS:
+            example = {prompt_key: 'prompt', response_key: 'response'}
+            tokenized_example = _tokenize_formatted_example(example, tokenizer)
+            assert 'input_ids' in tokenized_example
+            assert 'labels' in tokenized_example
+
+
 @pytest.mark.parametrize('split', ['train', 'custom', 'data'])
 def test_finetuning_dataloader_custom_split(tmp_path: pathlib.Path, split: str):
     tokenizer_name = 'gpt2'
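Note: the malformed cases can pass a MagicMock() as the tokenizer because key validation in _tokenize_formatted_example raises before the tokenizer is ever invoked; only the well-formed test needs a real tokenizer. A standalone illustration (assumes the module layout in this PR):

    from unittest.mock import MagicMock

    import pytest

    from llmfoundry.data.finetuning.tasks import _tokenize_formatted_example

    # one prompt key but no response key -> fails validation;
    # the mock tokenizer is never called
    with pytest.raises(KeyError):
        _tokenize_formatted_example({'prompt': 'p', 'extra': 'x'}, MagicMock())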
diff --git a/tests/test_icl_datasets.py b/tests/data/test_icl_datasets.py
similarity index 98%
rename from tests/test_icl_datasets.py
rename to tests/data/test_icl_datasets.py
index 28d12df91d..3a730fdf19 100644
--- a/tests/test_icl_datasets.py
+++ b/tests/data/test_icl_datasets.py
@@ -10,7 +10,7 @@
 from llmfoundry.utils.builders import build_icl_evaluators
 
 
-def load_icl_config(conf_path: str = 'tests/test_tasks.yaml'):
+def load_icl_config(conf_path: str = 'tests/data/test_tasks.yaml'):
     with open(conf_path) as f:
         test_cfg = om.load(f)
     return test_cfg
diff --git a/tests/test_packing.py b/tests/data/test_packing.py
similarity index 100%
rename from tests/test_packing.py
rename to tests/data/test_packing.py
diff --git a/tests/test_tasks.yaml b/tests/data/test_tasks.yaml
similarity index 100%
rename from tests/test_tasks.yaml
rename to tests/data/test_tasks.yaml
diff --git a/tests/data_utils.py b/tests/data_utils.py
index efb4f6d7cf..a0ad6bcd13 100644
--- a/tests/data_utils.py
+++ b/tests/data_utils.py
@@ -1,14 +1,8 @@
 # Copyright 2022 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0
 
-import os
-import sys
-
-# Add repo root to path so we can import scripts and test it
-repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
-sys.path.append(repo_dir)
-
 import json
+import os
 import pathlib
 import shutil
 from argparse import Namespace
@@ -120,10 +114,14 @@ def create_arxiv_dataset(path: pathlib.Path) -> str:
     arxiv_dir = os.path.join(path, f'my-copy-arxiv')
     downloaded_split = 'train'
 
+    arxiv_path = 'data_prep/example_data/arxiv.jsonl'
+    if not os.getcwd().endswith('scripts'):
+        arxiv_path = os.path.join('scripts', arxiv_path)
+
     main_json(
         Namespace(
             **{
-                'path': 'data_prep/example_data/arxiv.jsonl',
+                'path': arxiv_path,
                 'out_root': arxiv_dir,
                 'compression': None,
                 'split': downloaded_split,
@@ -139,8 +137,11 @@ def create_arxiv_dataset(path: pathlib.Path) -> str:
 
 def gpt_tiny_cfg(dataset_name: str, device: str):
     """Create gpt tiny cfg."""
+    from tests.fixtures.autouse import REPO_DIR
     conf_path: str = os.path.join(
-        repo_dir, 'scripts/train/yamls/pretrain/testing.yaml')
+        REPO_DIR,
+        'scripts/train/yamls/pretrain/testing.yaml',
+    )
     with open(conf_path) as f:
         test_cfg = om.load(f)
     assert isinstance(test_cfg, DictConfig)
diff --git a/tests/fixtures/autouse.py b/tests/fixtures/autouse.py
index c51ccfacb0..75caa6c941 100644
--- a/tests/fixtures/autouse.py
+++ b/tests/fixtures/autouse.py
@@ -2,11 +2,17 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import gc
+import os
+import sys
 
 import pytest
 import torch
 from composer.utils import dist, get_device, reproducibility
 
+# Add llm-foundry repo root to path so we can import scripts in the tests
+REPO_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))
+sys.path.append(REPO_DIR)
+
 
 @pytest.fixture(autouse=True)
 def initialize_dist(request: pytest.FixtureRequest):
@@ -33,6 +39,11 @@ def random_seed() -> int:
     return 17
 
 
+@pytest.fixture
+def foundry_dir() -> str:
+    return REPO_DIR
+
+
 @pytest.fixture(autouse=True)
 def seed_all(random_seed: int):
     """Sets the seed for reproducibility."""
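Note: REPO_DIR is computed once in the autouse fixtures module (loaded at collection time), so the sys.path tweak runs before any test module imports scripts.*, and tests reach the repo root through the foundry_dir fixture instead of per-file path hacks. A sketch of a consuming test, assuming these fixtures are wired up via conftest:

    import os


    def test_pretrain_yaml_exists(foundry_dir: str):
        # foundry_dir resolves to the llm-foundry repo root
        yaml_path = os.path.join(foundry_dir,
                                 'scripts/train/yamls/pretrain/testing.yaml')
        assert os.path.isfile(yaml_path)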
diff --git a/tests/models/__init__.py b/tests/models/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/models/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/models/hf/__init__.py b/tests/models/hf/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/models/hf/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/test_hf_config.py b/tests/models/hf/test_hf_config.py
similarity index 100%
rename from tests/test_hf_config.py
rename to tests/models/hf/test_hf_config.py
diff --git a/tests/test_hf_mpt_gen.py b/tests/models/hf/test_hf_mpt_gen.py
similarity index 100%
rename from tests/test_hf_mpt_gen.py
rename to tests/models/hf/test_hf_mpt_gen.py
diff --git a/tests/test_hf_v_mpt.py b/tests/models/hf/test_hf_v_mpt.py
similarity index 100%
rename from tests/test_hf_v_mpt.py
rename to tests/models/hf/test_hf_v_mpt.py
diff --git a/tests/models/inference_api_wrapper/__init__.py b/tests/models/inference_api_wrapper/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/models/inference_api_wrapper/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/test_inference_api_eval_wrapper.py b/tests/models/inference_api_wrapper/test_inference_api_eval_wrapper.py
similarity index 100%
rename from tests/test_inference_api_eval_wrapper.py
rename to tests/models/inference_api_wrapper/test_inference_api_eval_wrapper.py
diff --git a/tests/models/layers/__init__.py b/tests/models/layers/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/models/layers/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/test_flash_triton_torch.py b/tests/models/layers/test_flash_triton_torch.py
similarity index 100%
rename from tests/test_flash_triton_torch.py
rename to tests/models/layers/test_flash_triton_torch.py
diff --git a/tests/test_huggingface_flash.py b/tests/models/layers/test_huggingface_flash.py
similarity index 100%
rename from tests/test_huggingface_flash.py
rename to tests/models/layers/test_huggingface_flash.py
diff --git a/tests/test_fsdp_act_checkpoint.py b/tests/models/test_fsdp_act_checkpoint.py
similarity index 100%
rename from tests/test_fsdp_act_checkpoint.py
rename to tests/models/test_fsdp_act_checkpoint.py
diff --git a/tests/test_model.py b/tests/models/test_model.py
similarity index 100%
rename from tests/test_model.py
rename to tests/models/test_model.py
diff --git a/tests/test_mpt_gen.py b/tests/models/test_mpt_gen.py
similarity index 100%
rename from tests/test_mpt_gen.py
rename to tests/models/test_mpt_gen.py
diff --git a/tests/test_onnx.py b/tests/models/test_onnx.py
similarity index 100%
rename from tests/test_onnx.py
rename to tests/models/test_onnx.py
diff --git a/tests/test_rope_dail_vs_hf.py b/tests/models/test_rope_dail_vs_hf.py
similarity index 100%
rename from tests/test_rope_dail_vs_hf.py
rename to tests/models/test_rope_dail_vs_hf.py
diff --git a/tests/models/utils/__init__.py b/tests/models/utils/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/models/utils/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/test_init_fn.py b/tests/models/utils/test_param_init_fns.py
similarity index 100%
rename from tests/test_init_fn.py
rename to tests/models/utils/test_param_init_fns.py
diff --git a/tests/optim/__init__.py b/tests/optim/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/optim/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/test_lion8b.py b/tests/optim/test_lion8b.py
similarity index 100%
rename from tests/test_lion8b.py
rename to tests/optim/test_lion8b.py
diff --git a/tests/test_scheduler.py b/tests/optim/test_scheduler.py
similarity index 100%
rename from tests/test_scheduler.py
rename to tests/optim/test_scheduler.py
diff --git a/tests/test_data_prep_scripts.py b/tests/test_data_prep_scripts.py
deleted file mode 100644
index 4fe5ed7e64..0000000000
--- a/tests/test_data_prep_scripts.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Copyright 2022 MosaicML LLM Foundry authors
-# SPDX-License-Identifier: Apache-2.0
-
-import os
-import sys
-from argparse import Namespace
-from pathlib import Path
-
-# Add repo root to path so we can import scripts and test it
-repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
-sys.path.append(repo_dir)
-from scripts.data_prep.convert_dataset_hf import main as main_hf
-from scripts.data_prep.convert_dataset_json import main as main_json
-
-
-def test_download_script_from_api(tmp_path: Path):
-    # test calling it directly
-    path = os.path.join(tmp_path, 'my-copy-c4-1')
-    main_hf(
-        Namespace(
-            **{
-                'dataset': 'c4',
-                'data_subset': 'en',
-                'splits': ['val_xsmall'],
-                'out_root': path,
-                'compression': None,
-                'concat_tokens': None,
-                'bos_text': None,
-                'eos_text': None,
-                'no_wrap': False,
-                'num_workers': None
-            }))
-    assert os.path.exists(path)
-
-
-def test_json_script_from_api(tmp_path: Path):
-    # test calling it directly
-    path = os.path.join(tmp_path, 'my-copy-arxiv-1')
-    main_json(
-        Namespace(
-            **{
-                'path': 'scripts/data_prep/example_data/arxiv.jsonl',
-                'out_root': path,
-                'compression': None,
-                'split': 'train',
-                'concat_tokens': None,
-                'bos_text': None,
-                'eos_text': None,
-                'no_wrap': False,
-                'num_workers': None
-            }))
-    assert os.path.exists(path)
diff --git a/tests/tokenizers/__init__.py b/tests/tokenizers/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/tokenizers/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/test_tiktoken.py b/tests/tokenizers/test_tiktoken.py
similarity index 99%
rename from tests/test_tiktoken.py
rename to tests/tokenizers/test_tiktoken.py
index fe3db41d50..60907092c8 100644
--- a/tests/test_tiktoken.py
+++ b/tests/tokenizers/test_tiktoken.py
@@ -9,8 +9,9 @@
 
 from llmfoundry.tokenizers.tiktoken import (TiktokenTokenizerWrapper,
                                             bytes_to_unicode)
+from tests.a_scripts.inference.test_convert_composer_to_hf import \
+    check_hf_tokenizer_equivalence
 from tests.horrible_strings import HORRIBLE_STRINGS
-from tests.test_hf_conversion_script import check_hf_tokenizer_equivalence
 
 if TYPE_CHECKING:
     from tiktoken.core import Encoding
diff --git a/tests/test_tokenizer.py b/tests/tokenizers/test_tokenizer.py
similarity index 100%
rename from tests/test_tokenizer.py
rename to tests/tokenizers/test_tokenizer.py
diff --git a/tests/utils/__init__.py b/tests/utils/__init__.py
new file mode 100644
index 0000000000..f6c1f9f3ab
--- /dev/null
+++ b/tests/utils/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/test_builders.py b/tests/utils/test_builders.py
similarity index 100%
rename from tests/test_builders.py
rename to tests/utils/test_builders.py
diff --git a/tests/test_model_download_utils.py b/tests/utils/test_model_download_utils.py
similarity index 100%
rename from tests/test_model_download_utils.py
rename to tests/utils/test_model_download_utils.py
diff --git a/tests/test_prompt_files.py b/tests/utils/test_prompt_files.py
similarity index 100%
rename from tests/test_prompt_files.py
rename to tests/utils/test_prompt_files.py