diff --git a/scripts/eval/yamls/test_eval.yaml b/scripts/eval/yamls/test_eval.yaml index 3b8a6126a8..83d04bf321 100644 --- a/scripts/eval/yamls/test_eval.yaml +++ b/scripts/eval/yamls/test_eval.yaml @@ -25,7 +25,7 @@ device_eval_batch_size: 4 icl_subset_num_batches: 1 icl_tasks: - label: lambada_openai - dataset_uri: eval/local_data/language_understanding/lambada_openai.jsonl + dataset_uri: scripts/eval/local_data/language_understanding/lambada_openai.jsonl num_fewshot: [0] icl_task_type: language_modeling eval_gauntlet: diff --git a/tests/callbacks/__init__.py b/tests/callbacks/__init__.py index 05d33100f5..f6c1f9f3ab 100644 --- a/tests/callbacks/__init__.py +++ b/tests/callbacks/__init__.py @@ -1,3 +1,2 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 - diff --git a/tests/data/test_dataloader.py b/tests/data/test_dataloader.py index 2e9039644b..0f5f506e22 100644 --- a/tests/data/test_dataloader.py +++ b/tests/data/test_dataloader.py @@ -5,7 +5,6 @@ import pathlib import random import shutil -import sys import tempfile from argparse import Namespace from typing import Literal, Optional, Union @@ -26,10 +25,6 @@ build_text_dataloader, get_tokens_per_batch_func) from llmfoundry.utils.builders import build_tokenizer - -# Add repo root to path so we can import scripts and test it -repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) -sys.path.append(repo_dir) from scripts.data_prep.convert_dataset_hf import main as main_hf from tests.data_utils import make_tiny_ft_dataset diff --git a/tests/data/test_icl_datasets.py b/tests/data/test_icl_datasets.py index 28d12df91d..3a730fdf19 100644 --- a/tests/data/test_icl_datasets.py +++ b/tests/data/test_icl_datasets.py @@ -10,7 +10,7 @@ from llmfoundry.utils.builders import build_icl_evaluators -def load_icl_config(conf_path: str = 'tests/test_tasks.yaml'): +def load_icl_config(conf_path: str = 'tests/data/test_tasks.yaml'): with open(conf_path) as f: test_cfg = om.load(f) return test_cfg diff --git a/tests/test_tasks.yaml b/tests/data/test_tasks.yaml similarity index 100% rename from tests/test_tasks.yaml rename to tests/data/test_tasks.yaml diff --git a/tests/data_utils.py b/tests/data_utils.py index efb4f6d7cf..182fd4e0cd 100644 --- a/tests/data_utils.py +++ b/tests/data_utils.py @@ -1,14 +1,8 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 -import os -import sys - -# Add repo root to path so we can import scripts and test it -repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) -sys.path.append(repo_dir) - import json +import os import pathlib import shutil from argparse import Namespace @@ -139,8 +133,11 @@ def create_arxiv_dataset(path: pathlib.Path) -> str: def gpt_tiny_cfg(dataset_name: str, device: str): """Create gpt tiny cfg.""" - conf_path: str = os.path.join(repo_dir, - 'scripts/train/yamls/pretrain/testing.yaml') + from tests.fixtures.autouse import REPO_DIR + conf_path: str = os.path.join( + REPO_DIR, + 'scripts/train/yamls/pretrain/testing.yaml', + ) with open(conf_path) as f: test_cfg = om.load(f) assert isinstance(test_cfg, DictConfig) diff --git a/tests/fixtures/autouse.py b/tests/fixtures/autouse.py index c51ccfacb0..75caa6c941 100644 --- a/tests/fixtures/autouse.py +++ b/tests/fixtures/autouse.py @@ -2,11 +2,17 @@ # SPDX-License-Identifier: Apache-2.0 import gc +import os +import sys import pytest import torch from composer.utils import dist, get_device, reproducibility +# Add llm-foundry repo root to path so we can import scripts in the tests +REPO_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')) +sys.path.append(REPO_DIR) + @pytest.fixture(autouse=True) def initialize_dist(request: pytest.FixtureRequest): @@ -33,6 +39,11 @@ def random_seed() -> int: return 17 +@pytest.fixture +def foundry_dir() -> str: + return REPO_DIR + + @pytest.fixture(autouse=True) def seed_all(random_seed: int): """Sets the seed for reproducibility.""" diff --git a/tests/models/__init__.py b/tests/models/__init__.py index 05d33100f5..f6c1f9f3ab 100644 --- a/tests/models/__init__.py +++ b/tests/models/__init__.py @@ -1,3 +1,2 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 - diff --git a/tests/models/hf/__init__.py b/tests/models/hf/__init__.py index 05d33100f5..f6c1f9f3ab 100644 --- a/tests/models/hf/__init__.py +++ b/tests/models/hf/__init__.py @@ -1,3 +1,2 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 - diff --git a/tests/models/inference_api_wrapper/__init__.py b/tests/models/inference_api_wrapper/__init__.py index 05d33100f5..f6c1f9f3ab 100644 --- a/tests/models/inference_api_wrapper/__init__.py +++ b/tests/models/inference_api_wrapper/__init__.py @@ -1,3 +1,2 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 - diff --git a/tests/models/layers/__init__.py b/tests/models/layers/__init__.py index 05d33100f5..f6c1f9f3ab 100644 --- a/tests/models/layers/__init__.py +++ b/tests/models/layers/__init__.py @@ -1,3 +1,2 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 - diff --git a/tests/models/utils/__init__.py b/tests/models/utils/__init__.py index 05d33100f5..f6c1f9f3ab 100644 --- a/tests/models/utils/__init__.py +++ b/tests/models/utils/__init__.py @@ -1,3 +1,2 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 - diff --git a/tests/optim/__init__.py b/tests/optim/__init__.py index 05d33100f5..f6c1f9f3ab 100644 --- a/tests/optim/__init__.py +++ b/tests/optim/__init__.py @@ -1,3 +1,2 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 - diff --git a/tests/scripts/__init__.py b/tests/scripts/__init__.py index 05d33100f5..f6c1f9f3ab 100644 --- a/tests/scripts/__init__.py +++ b/tests/scripts/__init__.py @@ -1,3 +1,2 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 - diff --git a/tests/scripts/data_prep/__init__.py b/tests/scripts/data_prep/__init__.py index 05d33100f5..f6c1f9f3ab 100644 --- a/tests/scripts/data_prep/__init__.py +++ b/tests/scripts/data_prep/__init__.py @@ -1,3 +1,2 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 - diff --git a/tests/scripts/data_prep/test_convert_dataset_hf.py b/tests/scripts/data_prep/test_convert_dataset_hf.py index 2413d486b0..f226b0a4be 100644 --- a/tests/scripts/data_prep/test_convert_dataset_hf.py +++ b/tests/scripts/data_prep/test_convert_dataset_hf.py @@ -2,13 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 import os -import sys from argparse import Namespace from pathlib import Path -# Add repo root to path so we can import scripts and test it -repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) -sys.path.append(repo_dir) from scripts.data_prep.convert_dataset_hf import main as main_hf diff --git a/tests/scripts/data_prep/test_convert_dataset_json.py b/tests/scripts/data_prep/test_convert_dataset_json.py index fa65608e1f..179b8a701b 100644 --- a/tests/scripts/data_prep/test_convert_dataset_json.py +++ b/tests/scripts/data_prep/test_convert_dataset_json.py @@ -2,14 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 import os -import sys from argparse import Namespace from pathlib import Path -# Add repo root to path so we can import scripts and test it -repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) -sys.path.append(repo_dir) - from scripts.data_prep.convert_dataset_json import main as main_json diff --git a/tests/scripts/data_prep/test_convert_text_to_mds.py b/tests/scripts/data_prep/test_convert_text_to_mds.py index ab8c25bc2d..cc293a2cdd 100644 --- a/tests/scripts/data_prep/test_convert_text_to_mds.py +++ b/tests/scripts/data_prep/test_convert_text_to_mds.py @@ -2,13 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 import os -import sys - -import pytest - -# Add repo root to path so we can import scripts and test it -repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) -sys.path.append(repo_dir) import pathlib from concurrent.futures import ProcessPoolExecutor from glob import glob @@ -16,6 +9,7 @@ from unittest.mock import Mock, patch import numpy as np +import pytest from streaming import StreamingDataset from transformers import AutoTokenizer diff --git a/tests/scripts/eval/__init__.py b/tests/scripts/eval/__init__.py index 05d33100f5..f6c1f9f3ab 100644 --- a/tests/scripts/eval/__init__.py +++ b/tests/scripts/eval/__init__.py @@ -1,3 +1,2 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 - diff --git a/tests/scripts/eval/test_eval.py b/tests/scripts/eval/test_eval.py index 2fc96bb7ad..df159ad8ed 100644 --- a/tests/scripts/eval/test_eval.py +++ b/tests/scripts/eval/test_eval.py @@ -4,8 +4,7 @@ import copy import os import pathlib -import sys -from typing import Any +from typing import Any, Union import omegaconf as om import pytest @@ -14,32 +13,21 @@ from llmfoundry import COMPOSER_MODEL_REGISTRY from llmfoundry.utils import build_tokenizer +from scripts.eval.eval import main # noqa: E402 from tests.data_utils import (create_arxiv_dataset, create_c4_dataset_xxsmall, gpt_tiny_cfg) -# Add repo root to path so we can import scripts and test it -repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) -sys.path.append(repo_dir) - -from scripts.eval.eval import main # noqa: E402 - -@pytest.fixture(autouse=True) -def set_correct_cwd(): - if not os.getcwd().endswith('llm-foundry/scripts'): - os.chdir('scripts') - - yield - - if os.getcwd().endswith('llm-foundry/scripts'): - os.chdir('..') +@pytest.fixture +def eval_cfg(foundry_dir: str) -> Union[om.ListConfig, om.DictConfig]: + yaml_path = os.path.join(foundry_dir, 'scripts/eval/yamls/test_eval.yaml') + with open(yaml_path, 'r', encoding='utf-8') as f: + eval_cfg = om.OmegaConf.load(f) + return eval_cfg @pytest.fixture() -def mock_saved_model_path(): - # load the eval and model config - with open('eval/yamls/test_eval.yaml', 'r', encoding='utf-8') as f: - eval_cfg = om.OmegaConf.load(f) +def mock_saved_model_path(eval_cfg: Union[om.ListConfig, om.DictConfig]): model_cfg = eval_cfg.models[0] # set device to cpu device = 'cpu' @@ -60,12 +48,11 @@ def mock_saved_model_path(): os.remove(saved_model_path) -def test_icl_eval(capfd: Any, mock_saved_model_path: Any): - with open('eval/yamls/test_eval.yaml', 'r', encoding='utf-8') as f: - test_cfg = om.OmegaConf.load(f) - test_cfg.models[0].load_path = mock_saved_model_path - assert isinstance(test_cfg, om.DictConfig) - main(test_cfg) +def test_icl_eval(eval_cfg: Union[om.ListConfig, om.DictConfig], capfd: Any, + mock_saved_model_path: Any): + eval_cfg.models[0].load_path = mock_saved_model_path + assert isinstance(eval_cfg, om.DictConfig) + main(eval_cfg) out, _ = capfd.readouterr() expected_results = '| Category | Benchmark | Subtask | Accuracy | Number few shot | Model |\n|:----------------------------|:---------------|:----------|-----------:|:------------------|:---------|\n| language_understanding_lite | lambada_openai | | 0 | 0-shot | tiny_mpt |' assert expected_results in out diff --git a/tests/scripts/eval/test_eval_inputs.py b/tests/scripts/eval/test_eval_inputs.py index 83104b62b7..8694546c4f 100644 --- a/tests/scripts/eval/test_eval_inputs.py +++ b/tests/scripts/eval/test_eval_inputs.py @@ -2,7 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 import copy import os -import sys import warnings import omegaconf @@ -10,10 +9,6 @@ from omegaconf import DictConfig from omegaconf import OmegaConf as om -# Add repo root to path so we can import scripts and test it -repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) -sys.path.append(repo_dir) - from scripts.eval.eval import main # noqa: E402 @@ -21,10 +16,12 @@ class TestHuggingFaceEvalYAMLInputs: """Validate and tests error handling for the input YAML file.""" @pytest.fixture - def cfg(self) -> DictConfig: + def cfg(self, foundry_dir: str) -> DictConfig: """Create YAML cfg fixture for testing purposes.""" - conf_path: str = os.path.join(repo_dir, - 'scripts/eval/yamls/hf_eval.yaml') + conf_path: str = os.path.join( + foundry_dir, + 'scripts/eval/yamls/hf_eval.yaml', + ) with open(conf_path, 'r', encoding='utf-8') as config: test_cfg = om.load(config) assert isinstance(test_cfg, DictConfig) @@ -78,15 +75,17 @@ def test_optional_mispelled_params_raise_warning(self, class TestMPTEvalYAMLInputs: @pytest.fixture - def cfg(self) -> DictConfig: + def cfg(self, foundry_dir: str) -> DictConfig: """Create YAML cfg fixture for testing purposes.""" - conf_path: str = os.path.join(repo_dir, - 'scripts/eval/yamls/mpt_eval.yaml') + conf_path: str = os.path.join( + foundry_dir, + 'scripts/eval/yamls/mpt_eval.yaml', + ) with open(conf_path, 'r', encoding='utf-8') as config: test_cfg = om.load(config) test_cfg.icl_tasks[0].dataset_uri = os.path.join( - repo_dir, 'scripts', test_cfg.icl_tasks[0].dataset_uri) + foundry_dir, 'scripts', test_cfg.icl_tasks[0].dataset_uri) # make tests use cpu initialized transformer models only test_cfg.models[0].model.init_device = 'cpu' diff --git a/tests/scripts/inference/__init__.py b/tests/scripts/inference/__init__.py index 05d33100f5..f6c1f9f3ab 100644 --- a/tests/scripts/inference/__init__.py +++ b/tests/scripts/inference/__init__.py @@ -1,3 +1,2 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 - diff --git a/tests/scripts/inference/test_convert_composer_to_hf.py b/tests/scripts/inference/test_convert_composer_to_hf.py index f9191cd701..d21c942dee 100644 --- a/tests/scripts/inference/test_convert_composer_to_hf.py +++ b/tests/scripts/inference/test_convert_composer_to_hf.py @@ -4,34 +4,26 @@ import math import os import pathlib -import sys -from typing import Callable -from unittest.mock import ANY, MagicMock, patch - -from composer import Trainer -from composer.loggers import MLFlowLogger -from composer.utils import dist, get_device, using_torch_2 - -from llmfoundry.callbacks import HuggingFaceCheckpointer -from llmfoundry.models.mpt.modeling_mpt import ComposerMPTCausalLM - -# Add repo root to path so we can import scripts and test it -repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) -sys.path.append(repo_dir) import shutil from argparse import Namespace -from typing import Optional, cast +from typing import Callable, Optional, cast +from unittest.mock import ANY, MagicMock, patch import pytest import torch import transformers +from composer import Trainer +from composer.loggers import MLFlowLogger +from composer.utils import dist, get_device, using_torch_2 from omegaconf import DictConfig from omegaconf import OmegaConf as om from torch.utils.data import DataLoader from transformers import PreTrainedModel, PreTrainedTokenizerBase from llmfoundry import COMPOSER_MODEL_REGISTRY +from llmfoundry.callbacks import HuggingFaceCheckpointer from llmfoundry.data.finetuning import build_finetuning_dataloader +from llmfoundry.models.mpt.modeling_mpt import ComposerMPTCausalLM from llmfoundry.utils.builders import build_optimizer, build_tokenizer from scripts.inference.convert_composer_to_hf import convert_composer_to_hf from tests.data_utils import make_tiny_ft_dataset diff --git a/tests/scripts/train/__init__.py b/tests/scripts/train/__init__.py index 05d33100f5..f6c1f9f3ab 100644 --- a/tests/scripts/train/__init__.py +++ b/tests/scripts/train/__init__.py @@ -1,3 +1,2 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 - diff --git a/tests/scripts/train/test_train.py b/tests/scripts/train/test_train.py index 3cd2963100..37c3a15abb 100644 --- a/tests/scripts/train/test_train.py +++ b/tests/scripts/train/test_train.py @@ -1,7 +1,6 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 import copy -import os import pathlib from typing import Any, Optional @@ -16,17 +15,6 @@ gpt_tiny_cfg) -@pytest.fixture(autouse=False) -def set_correct_cwd(): - if not os.getcwd().endswith('llm-foundry/scripts'): - os.chdir('scripts') - - yield - - if os.getcwd().endswith('llm-foundry/scripts'): - os.chdir('..') - - @pytest.mark.parametrize('averages', [{ 'core_average': ['language_understanding_lite'] }, None]) diff --git a/tests/scripts/train/test_train_inputs.py b/tests/scripts/train/test_train_inputs.py index 2ed1c9c239..17eca26587 100644 --- a/tests/scripts/train/test_train_inputs.py +++ b/tests/scripts/train/test_train_inputs.py @@ -3,7 +3,6 @@ import copy import json import os -import sys import warnings import omegaconf @@ -11,10 +10,6 @@ from omegaconf import DictConfig from omegaconf import OmegaConf as om -# Add repo root to path so we can import scripts and test it -repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) -sys.path.append(repo_dir) - from scripts.train.train import main # noqa: E402 @@ -54,10 +49,10 @@ class TestTrainingYAMLInputs: """Validate and tests error handling for the input YAML file.""" @pytest.fixture - def cfg(self) -> DictConfig: + def cfg(self, foundry_dir: str) -> DictConfig: """Create YAML cfg fixture for testing purposes.""" conf_path: str = os.path.join( - repo_dir, 'scripts/train/yamls/pretrain/testing.yaml') + foundry_dir, 'scripts/train/yamls/pretrain/testing.yaml') with open(conf_path, 'r', encoding='utf-8') as config: test_cfg = om.load(config) assert isinstance(test_cfg, DictConfig) diff --git a/tests/tokenizers/__init__.py b/tests/tokenizers/__init__.py index 05d33100f5..f6c1f9f3ab 100644 --- a/tests/tokenizers/__init__.py +++ b/tests/tokenizers/__init__.py @@ -1,3 +1,2 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 - diff --git a/tests/utils/__init__.py b/tests/utils/__init__.py index 05d33100f5..f6c1f9f3ab 100644 --- a/tests/utils/__init__.py +++ b/tests/utils/__init__.py @@ -1,3 +1,2 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 -