From 3100859905c1ed29e049e7c203cf70da8231f2e6 Mon Sep 17 00:00:00 2001 From: Anna Date: Thu, 30 Nov 2023 14:02:13 -0800 Subject: [PATCH] Add eval loader to eval script (#742) * Add eval loader to eval script * small input tests * updates * fix typing and formatting * fixes, add tests * remove circular dependency * tests pass * nits + small fixes * add metrics at the end, refactor to put icl/gauntlet as helpers * NOT * metrics instead of models, add unit tests --- llmfoundry/data/dataloader.py | 32 ++++----- llmfoundry/utils/builders.py | 81 +++++++++++++++++++++++ scripts/eval/eval.py | 53 +++++++++++---- scripts/train/train.py | 52 +++++---------- tests/data_utils.py | 98 +++++++++++++++++++++++++++- tests/test_builders.py | 118 +++++++++++++++++++++++++++++++++- tests/test_dataloader.py | 11 ++++ tests/test_eval.py | 89 +++++++++++++++++++++++++ tests/test_eval_inputs.py | 1 + tests/test_train_inputs.py | 2 +- tests/test_training.py | 97 ++-------------------------- 11 files changed, 469 insertions(+), 165 deletions(-) diff --git a/llmfoundry/data/dataloader.py b/llmfoundry/data/dataloader.py index 12741717be..63d47a65d5 100644 --- a/llmfoundry/data/dataloader.py +++ b/llmfoundry/data/dataloader.py @@ -11,6 +11,12 @@ from llmfoundry.data.finetuning.dataloader import build_finetuning_dataloader from llmfoundry.data.text_data import build_text_dataloader +LOADER_NAME_TO_FUNCTION = { + 'text': build_text_dataloader, + 'text_denoising': build_text_denoising_dataloader, + 'finetuning': build_finetuning_dataloader, +} + def build_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, device_batch_size: int) -> DataSpec: @@ -22,23 +28,9 @@ def build_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, device_batch_size (int): The size of the batches (number of examples) that the dataloader will produce. """ - if cfg.name == 'text': - return build_text_dataloader( - cfg, - tokenizer, - device_batch_size, - ) - elif cfg.name == 'text_denoising': - return build_text_denoising_dataloader( - cfg, - tokenizer, - device_batch_size, - ) - elif cfg.name == 'finetuning': - return build_finetuning_dataloader( - cfg, - tokenizer, - device_batch_size, - ) - else: - raise ValueError(f'Not sure how to build dataloader with config: {cfg}') + if cfg.name not in LOADER_NAME_TO_FUNCTION: + allowed = ', '.join(LOADER_NAME_TO_FUNCTION.keys()) + raise ValueError(f'Expected dataloader name to be one of {allowed}' + + f' but found name "{cfg.name}" in config: {cfg}') + + return LOADER_NAME_TO_FUNCTION[cfg.name](cfg, tokenizer, device_batch_size) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 14196c3ef9..a672fbee55 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -28,12 +28,14 @@ from omegaconf import DictConfig, ListConfig from omegaconf import OmegaConf as om from torch.optim.optimizer import Optimizer +from torchmetrics import Metric from transformers import AutoTokenizer, PreTrainedTokenizerBase from llmfoundry.callbacks import (EvalGauntlet, FDiffMetrics, GlobalLRScaling, HuggingFaceCheckpointer, LayerFreezing, MonolithicCheckpointSaver, ScheduledGarbageCollector) +from llmfoundry.data.dataloader import build_dataloader from llmfoundry.optim import (DecoupledAdaLRLion, DecoupledClipLion, DecoupledLionW, DecoupledLionW_8bit) from llmfoundry.optim.scheduler import InverseSquareRootWithWarmupScheduler @@ -42,6 +44,85 @@ log = logging.getLogger(__name__) +def build_evaluators( + eval_loader_config: Optional[Union[DictConfig, ListConfig]], + icl_tasks_config: Optional[Union[str, ListConfig]], + eval_gauntlet_config: Optional[Union[str, DictConfig]], + *, + tokenizer: PreTrainedTokenizerBase, + device_eval_batch_size: int, + icl_seq_len: int, + icl_subset_num_batches: Optional[int], +) -> Tuple[List[Evaluator], List[str], Optional[EvalGauntlet]]: + + evaluators = [] + if eval_loader_config is not None: + evaluators = build_eval_loaders( + eval_loader_config, + tokenizer, + device_eval_batch_size, + ) + + logger_keys = [] + eval_gauntlet_callback = None + if icl_tasks_config is not None: + icl_evaluators, logger_keys, eval_gauntlet_callback = build_icl_data_and_gauntlet( + icl_tasks_config, + eval_gauntlet_config, + tokenizer, + device_eval_batch_size, + icl_seq_len, + icl_subset_num_batches, + ) + evaluators.extend(icl_evaluators) + + return evaluators, logger_keys, eval_gauntlet_callback + + +def build_eval_loaders( + eval_loader_config: Union[DictConfig, ListConfig], + tokenizer: PreTrainedTokenizerBase, + device_eval_batch_size: int, +) -> List[Evaluator]: + evaluators: List[Evaluator] = [] + if isinstance(eval_loader_config, ListConfig): + eval_configs: ListConfig = eval_loader_config + is_multi_eval = True + else: + eval_configs = ListConfig([eval_loader_config]) + is_multi_eval = False + + for eval_config in eval_configs: + eval_dataloader = build_dataloader(eval_config, tokenizer, + device_eval_batch_size) + eval_loader: Evaluator = Evaluator( + label=f'eval/{eval_config.label}' if is_multi_eval else 'eval', + dataloader=eval_dataloader, + # Load the eval data to fail fast. metrics will get added + # later in add_metrics_to_eval_loaders, after the model is loaded + metric_names=[], + ) + evaluators.append(eval_loader) + return evaluators + + +def add_metrics_to_eval_loaders( + evaluators: List[Evaluator], + metrics: Dict[str, Metric], +) -> List[Evaluator]: + metric_names = list(metrics.keys()) + eval_loaders, other_evaluators = [], [] + for evaluator in evaluators: + if evaluator.metric_names == []: + evaluator.metric_names = metric_names + eval_loaders.append(evaluator) + else: + other_evaluators.append(evaluator) + + # Put the base eval_loaders first + return eval_loaders + other_evaluators + + def build_icl_data_and_gauntlet( icl_tasks_config: Union[str, ListConfig], eval_gauntlet_config: Optional[Union[str, DictConfig]], diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 02a5d1f862..369a894720 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -6,7 +6,7 @@ import sys import time import warnings -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Tuple, Union import pandas as pd import torch @@ -21,13 +21,14 @@ from llmfoundry.models import MPTForCausalLM from llmfoundry.models.model_registry import COMPOSER_MODEL_REGISTRY -from llmfoundry.utils.builders import (build_icl_data_and_gauntlet, - build_logger, build_tokenizer) +from llmfoundry.utils.builders import (add_metrics_to_eval_loaders, + build_evaluators, build_logger, + build_tokenizer) from llmfoundry.utils.config_utils import pop_config, process_init_device def load_peft_model(model_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, - num_retries: int) -> Optional[ComposerModel]: + num_retries: int) -> ComposerModel: try: from peft import PeftModel except ImportError as e: @@ -43,7 +44,8 @@ def load_peft_model(model_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, } retries = 0 - while retries < num_retries: + composer_model_wrapper = None + while retries < num_retries and composer_model_wrapper is None: try: trust_remote_code = model_cfg.get('trust_remote_code', True) use_auth_token = model_cfg.get('use_auth_token', False) @@ -58,7 +60,6 @@ def load_peft_model(model_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, composer_model_wrapper = COMPOSER_MODEL_REGISTRY[model_cfg.name]( peft_model, tokenizer) - return composer_model_wrapper except Exception as e: retries += 1 if retries >= num_retries: @@ -68,19 +69,21 @@ def load_peft_model(model_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, f'Got exception {str(e)} while loading model {model_cfg.name}. {num_retries-retries} retries remaining' ) + assert composer_model_wrapper is not None + return composer_model_wrapper + def load_model(model_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, - fsdp_config: Optional[Dict], - num_retries: int) -> Optional[ComposerModel]: + fsdp_config: Optional[Dict], num_retries: int) -> ComposerModel: init_context = process_init_device(model_cfg, fsdp_config) retries = 0 + composer_model = None with init_context: - while retries < num_retries: + while retries < num_retries and composer_model is None: try: composer_model = COMPOSER_MODEL_REGISTRY[model_cfg.name]( model_cfg, tokenizer) - return composer_model except Exception as e: retries += 1 if retries >= num_retries: @@ -90,6 +93,9 @@ def load_model(model_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, f'Got exception {str(e)} while loading model {model_cfg.name}. {num_retries-retries} retries remaining' ) + assert composer_model is not None + return composer_model + def evaluate_model( model_cfg: DictConfig, @@ -100,6 +106,7 @@ def evaluate_model( max_seq_len: int, device_eval_batch_size: int, eval_gauntlet_config: Optional[Union[str, DictConfig]], + eval_loader_config: Optional[Union[DictConfig, ListConfig]], fsdp_config: Optional[Dict], num_retries: int, loggers_cfg: Dict[str, Any], @@ -118,9 +125,15 @@ def evaluate_model( tokenizer_kwargs = tokenizer_cfg.get('kwargs', {}) tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) - evaluators, logger_keys, eval_gauntlet_callback = build_icl_data_and_gauntlet( - icl_tasks, eval_gauntlet_config, tokenizer, device_eval_batch_size, - max_seq_len, icl_subset_num_batches) + evaluators, logger_keys, eval_gauntlet_callback = build_evaluators( + eval_loader_config, + icl_tasks, + eval_gauntlet_config, + tokenizer=tokenizer, + device_eval_batch_size=device_eval_batch_size, + icl_seq_len=max_seq_len, + icl_subset_num_batches=icl_subset_num_batches, + ) callbacks = [] if eval_gauntlet_callback is not None: @@ -143,6 +156,11 @@ def evaluate_model( composer_model = load_model(model_cfg.model, tokenizer, fsdp_config, num_retries) + # Now add the eval metrics + if eval_loader_config is not None: + train_metrics = composer_model.get_metrics(is_train=True) + evaluators = add_metrics_to_eval_loaders(evaluators, train_metrics) + if eval_gauntlet_df is None and eval_gauntlet_callback is not None: eval_gauntlet_df = pd.DataFrame( columns=['model_name'] + @@ -186,7 +204,7 @@ def evaluate_model( return (trainer, logger_keys, eval_gauntlet_callback, eval_gauntlet_df) -def main(cfg: DictConfig): +def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: om.resolve(cfg) model_configs: ListConfig = pop_config(cfg, 'models', must_exist=True) eval_gauntlet_config: Optional[Union[str, DictConfig]] = pop_config( @@ -228,6 +246,8 @@ def main(cfg: DictConfig): default_value='debug') # Optional Evaluation Parameters with default values + eval_loader_config: Optional[Union[DictConfig, ListConfig]] = pop_config( + cfg, 'eval_loader', must_exist=False, default_value=None) seed: int = pop_config(cfg, 'seed', must_exist=False, default_value=17) dist_timeout: Union[float, int] = pop_config(cfg, 'dist_timeout', @@ -274,6 +294,7 @@ def main(cfg: DictConfig): eval_gauntlet_df = None models_df = None composite_scores = None + trainers = [] for model_cfg in model_configs: (trainer, logger_keys, eval_gauntlet_callback, eval_gauntlet_df) = evaluate_model( @@ -285,6 +306,7 @@ def main(cfg: DictConfig): max_seq_len=max_seq_len, device_eval_batch_size=device_eval_batch_size, eval_gauntlet_config=eval_gauntlet_config, + eval_loader_config=eval_loader_config, fsdp_config=fsdp_config, num_retries=num_retries, loggers_cfg=loggers_cfg, @@ -292,6 +314,7 @@ def main(cfg: DictConfig): precision=precision, eval_gauntlet_df=eval_gauntlet_df, icl_subset_num_batches=icl_subset_num_batches) + trainers.append(trainer) if eval_gauntlet_callback is not None: composite_scores = eval_gauntlet_callback.eval_after_all( @@ -330,6 +353,8 @@ def main(cfg: DictConfig): assert models_df is not None print(models_df.to_markdown(index=False)) + return trainers, eval_gauntlet_df + def calculate_markdown_results(logger_keys: List[str], trainer: Trainer, benchmark_to_taxonomy: Dict[str, str], diff --git a/scripts/train/train.py b/scripts/train/train.py index 88f776375f..809f2fb09c 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -11,7 +11,6 @@ import torch from composer import Trainer -from composer.core import Evaluator from composer.core.callback import Callback from composer.loggers import MosaicMLLogger from composer.loggers.mosaicml_logger import (MOSAICML_ACCESS_TOKEN_ENV_VAR, @@ -26,10 +25,11 @@ from llmfoundry import (COMPOSER_MODEL_REGISTRY, ComposerHFCausalLM, MPTForCausalLM) from llmfoundry.data.dataloader import build_dataloader -from llmfoundry.utils.builders import (build_algorithm, build_callback, - build_icl_data_and_gauntlet, - build_logger, build_optimizer, - build_scheduler, build_tokenizer) +from llmfoundry.utils.builders import (add_metrics_to_eval_loaders, + build_algorithm, build_callback, + build_evaluators, build_logger, + build_optimizer, build_scheduler, + build_tokenizer) from llmfoundry.utils.config_utils import (log_config, pop_config, process_init_device, update_batch_size_info) @@ -526,31 +526,16 @@ def main(cfg: DictConfig) -> Trainer: ## Evaluation print('Building eval loader...') - evaluators = [] - eval_loaders = [] - if eval_loader_config is not None: - is_multi_eval = isinstance(eval_loader_config, ListConfig) - eval_configs = eval_loader_config if is_multi_eval else [ - eval_loader_config - ] - for eval_config in eval_configs: - eval_dataloader = build_dataloader(eval_config, tokenizer, - device_eval_batch_size) - eval_loader = Evaluator( - label=f'eval/{eval_config.label}' if is_multi_eval else 'eval', - dataloader=eval_dataloader, - metric_names=[], # we will add these after model is created - ) - eval_loaders.append(eval_loader) - - eval_gauntlet_callback = None - - if icl_tasks_config is not None: - icl_evaluators, _, eval_gauntlet_callback = build_icl_data_and_gauntlet( - icl_tasks_config, eval_gauntlet_config, tokenizer, - device_eval_batch_size, icl_seq_len if icl_seq_len else max_seq_len, - icl_subset_num_batches) - evaluators.extend(icl_evaluators) + eval_icl_seq_len: int = icl_seq_len if icl_seq_len else max_seq_len + evaluators, _, eval_gauntlet_callback = build_evaluators( + eval_loader_config, + icl_tasks_config, + eval_gauntlet_config, + tokenizer=tokenizer, + device_eval_batch_size=device_eval_batch_size, + icl_seq_len=eval_icl_seq_len, + icl_subset_num_batches=icl_subset_num_batches, + ) if eval_gauntlet_callback is not None: callbacks.append(eval_gauntlet_callback) @@ -581,11 +566,8 @@ def main(cfg: DictConfig) -> Trainer: # Now add the eval metrics if eval_loader_config is not None: - assert model.train_metrics is not None - eval_metric_names = list(model.train_metrics.keys()) - for eval_loader in eval_loaders: - eval_loader.metric_names = eval_metric_names - evaluators.insert(0, eval_loader) # Put the base eval_loaders first + train_metrics = model.get_metrics(is_train=True) + evaluators = add_metrics_to_eval_loaders(evaluators, train_metrics) # Build the Trainer print('Building trainer...') diff --git a/tests/data_utils.py b/tests/data_utils.py index 075933de7d..efb4f6d7cf 100644 --- a/tests/data_utils.py +++ b/tests/data_utils.py @@ -1,10 +1,26 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 -import json import os +import sys + +# Add repo root to path so we can import scripts and test it +repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) +sys.path.append(repo_dir) + +import json +import pathlib +import shutil +from argparse import Namespace from typing import Optional +from omegaconf import DictConfig +from omegaconf import OmegaConf as om + +from scripts.data_prep.convert_dataset_hf import main as main_hf # noqa: E402 +from scripts.data_prep.convert_dataset_json import \ + main as main_json # noqa: E402 + def make_tiny_ft_dataset( path: str, @@ -65,3 +81,83 @@ def make_tiny_ft_dataset( for sample in samples: _f.write(json.dumps(sample)) _f.write('\n') + + +def create_c4_dataset_xxsmall(path: pathlib.Path) -> str: + """Creates a small mocked version of the C4 dataset.""" + c4_dir = os.path.join(path, f'my-copy-c4') + downloaded_split = 'val_xxsmall' # very fast to convert + + # Hyperparameters from https://github.com/mosaicml/llm-foundry/blob/340a56658560ebceb2a3aa69d6e37813e415acd0/README.md#L188 + main_hf( + Namespace( + **{ + 'dataset': 'c4', + 'data_subset': 'en', + 'splits': [downloaded_split], + 'out_root': c4_dir, + 'compression': None, + 'concat_tokens': 2048, + 'tokenizer': 'EleutherAI/gpt-neox-20b', + 'tokenizer_kwargs': {}, + 'bos_text': '', + 'eos_text': '<|endoftext|>', + 'no_wrap': False, + 'num_workers': 8 + })) + + # copy the small downloaded_split to other c4 splits for mocking purposes + mocked_splits = ['train', 'val'] + for mocked_split in mocked_splits: + shutil.copytree(os.path.join(c4_dir, 'val_xxsmall'), + os.path.join(c4_dir, mocked_split)) + assert os.path.exists(c4_dir) + return c4_dir + + +def create_arxiv_dataset(path: pathlib.Path) -> str: + """Creates an arxiv dataset.""" + arxiv_dir = os.path.join(path, f'my-copy-arxiv') + downloaded_split = 'train' + + main_json( + Namespace( + **{ + 'path': 'data_prep/example_data/arxiv.jsonl', + 'out_root': arxiv_dir, + 'compression': None, + 'split': downloaded_split, + 'concat_tokens': None, + 'bos_text': None, + 'eos_text': None, + 'no_wrap': False, + 'num_workers': None + })) + + return arxiv_dir + + +def gpt_tiny_cfg(dataset_name: str, device: str): + """Create gpt tiny cfg.""" + conf_path: str = os.path.join(repo_dir, + 'scripts/train/yamls/pretrain/testing.yaml') + with open(conf_path) as f: + test_cfg = om.load(f) + assert isinstance(test_cfg, DictConfig) + + test_cfg.data_local = dataset_name + test_cfg.global_train_batch_size = 8 + test_cfg.device_eval_batch_size = 4 + test_cfg.device_train_microbatch_size = 4 + test_cfg.max_duration = '4ba' + test_cfg.eval_interval = '4ba' + test_cfg.run_name = 'gpt-mini-integration-test' + + if device == 'cpu': + test_cfg.model.init_device = 'cpu' + test_cfg.fsdp_config = None + test_cfg.model.attn_config.attn_impl = 'torch' + test_cfg.model.loss_fn = 'torch_crossentropy' + test_cfg.precision = 'fp32' + + return test_cfg diff --git a/tests/test_builders.py b/tests/test_builders.py index 7ac179720e..5c38ed8602 100644 --- a/tests/test_builders.py +++ b/tests/test_builders.py @@ -5,17 +5,22 @@ import unittest.mock as mock from copy import deepcopy from typing import Any, Dict, Union +from unittest.mock import MagicMock import pytest import torch import torch.nn as nn from composer.callbacks import Generate +from composer.core import Evaluator +from omegaconf import DictConfig, ListConfig from omegaconf import OmegaConf as om from transformers import PreTrainedTokenizerBase from llmfoundry.callbacks import HuggingFaceCheckpointer from llmfoundry.tokenizers.tiktoken import TiktokenTokenizerWrapper -from llmfoundry.utils.builders import (build_callback, build_optimizer, +from llmfoundry.utils.builders import (add_metrics_to_eval_loaders, + build_callback, build_eval_loaders, + build_evaluators, build_optimizer, build_tokenizer) @@ -195,3 +200,114 @@ def test_build_optimizer(name: str, optimizer_config: Dict[str, Any], for n, p in model.named_parameters(): if re.search(param_str_match, n): assert id(p) in param_ids + + +def test_build_evaluators_empty(): + evaluators, logger_keys, eval_gauntlet_callback = build_evaluators( + None, + None, + None, + tokenizer=None, # type: ignore + device_eval_batch_size=1, + icl_seq_len=2, + icl_subset_num_batches=3) + assert evaluators == [] + assert logger_keys == [] + assert eval_gauntlet_callback is None + + +def test_build_eval_loaders(monkeypatch: pytest.MonkeyPatch): + tokenizer = TiktokenTokenizerWrapper(model_name='gpt-4') + + eval_loader_cfg = DictConfig({ + 'name': 'text', + 'dataset': { + # mocked, not needed + }, + 'drop_last': False, + 'num_workers': 8, + }) + monkeypatch.setattr('llmfoundry.data.text_data.StreamingTextDataset', + lambda *args, **kwargs: MagicMock()) + eval_loaders = build_eval_loaders(eval_loader_cfg, tokenizer, 2) + + assert len(eval_loaders) == 1 + + assert eval_loaders[0].label == 'eval' + assert eval_loaders[0].dataloader is not None + assert eval_loaders[0].metric_names == [] + + multi_eval_loader_cfg = ListConfig([ + { + 'name': 'text', + 'label': 'test1', + 'dataset': { + # mocked, not needed + }, + 'drop_last': False, + 'num_workers': 8, + }, + { + 'name': 'text', + 'label': 'test2', + 'dataset': { + # mocked, not needed + }, + 'drop_last': False, + 'num_workers': 8, + } + ]) + monkeypatch.setattr('llmfoundry.data.text_data.StreamingTextDataset', + lambda *args, **kwargs: MagicMock()) + eval_loaders2 = build_eval_loaders(multi_eval_loader_cfg, tokenizer, 2) + + assert len(eval_loaders2) == 2 + + assert eval_loaders2[0].label == 'eval/test1' + assert eval_loaders2[0].dataloader is not None + assert eval_loaders2[0].metric_names == [] + + assert eval_loaders2[1].label == 'eval/test2' + assert eval_loaders2[1].dataloader is not None + assert eval_loaders2[1].metric_names == [] + + +def test_add_metrics_to_eval_loaders(): + evaluators = [ + Evaluator( + label='first', + metric_names=['a', 'b'], + dataloader=None, # type: ignore + device_eval_microbatch_size=1, + ), + Evaluator( + label='second', + metric_names=[], + dataloader=None, # type: ignore + device_eval_microbatch_size=1, + ), + Evaluator( + label='third', + metric_names=['c'], + dataloader=None, # type: ignore + device_eval_microbatch_size=1, + ) + ] + + new_evaluators = add_metrics_to_eval_loaders( + evaluators, + { + 'new1': 'foo', + 'new2': 'bar' + }, # type: ignore + ) + assert len(new_evaluators) == 3 + + assert new_evaluators[0].label == 'second' + assert new_evaluators[0].metric_names == ['new1', 'new2'] + + assert new_evaluators[1].label == 'first' + assert new_evaluators[1].metric_names == ['a', 'b'] + + assert new_evaluators[2].label == 'third' + assert new_evaluators[2].metric_names == ['c'] diff --git a/tests/test_dataloader.py b/tests/test_dataloader.py index c35d29f74d..2e9039644b 100644 --- a/tests/test_dataloader.py +++ b/tests/test_dataloader.py @@ -21,6 +21,7 @@ from llmfoundry import (build_finetuning_dataloader, build_text_denoising_dataloader) +from llmfoundry.data import build_dataloader from llmfoundry.data.text_data import (ConcatenatedSequenceCollatorWrapper, build_text_dataloader, get_tokens_per_batch_func) @@ -740,3 +741,13 @@ def test_token_counting_func_dataloader_setting( actual_token_count = dl.get_num_tokens_in_batch(batch_tokenized) assert actual_token_count == expected_token_count + + +def test_build_unknown_dataloader(): + cfg = DictConfig({ + 'name': 'unknown', + }) + tokenizer = MagicMock() + with pytest.raises(ValueError, + match='Expected dataloader name to be one of'): + _ = build_dataloader(cfg, tokenizer, 2) diff --git a/tests/test_eval.py b/tests/test_eval.py index 1217487b70..2fc96bb7ad 100644 --- a/tests/test_eval.py +++ b/tests/test_eval.py @@ -1,16 +1,21 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 +import copy import os +import pathlib import sys from typing import Any import omegaconf as om import pytest from composer import Trainer +from composer.loggers import InMemoryLogger from llmfoundry import COMPOSER_MODEL_REGISTRY from llmfoundry.utils import build_tokenizer +from tests.data_utils import (create_arxiv_dataset, create_c4_dataset_xxsmall, + gpt_tiny_cfg) # Add repo root to path so we can import scripts and test it repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) @@ -66,3 +71,87 @@ def test_icl_eval(capfd: Any, mock_saved_model_path: Any): assert expected_results in out expected_results = '| model_name | default_average | language_understanding_lite |\n|:-------------|------------------:|------------------------------:|\n| tiny_mpt | 0 | 0 |' assert expected_results in out + + +@pytest.mark.gpu +def test_loader_eval(capfd: Any, mock_saved_model_path: Any, + tmp_path: pathlib.Path): + + c4_dataset_name = create_c4_dataset_xxsmall(tmp_path) + + # Use a training config that already has eval loader configured + test_cfg = gpt_tiny_cfg(c4_dataset_name, 'cpu') + + # define icl eval task + test_cfg.icl_tasks = om.ListConfig([ + om.DictConfig({ + 'label': + 'lambada_openai', + 'dataset_uri': + 'eval/local_data/language_understanding/lambada_openai_small.jsonl', + 'num_fewshot': [0], + 'icl_task_type': + 'language_modeling' + }) + ]) + + # convert the model from a training to eval model + model = test_cfg.pop('model') + eval_model = { + 'model_name': model.get('name'), + 'model': model, + 'load_path': mock_saved_model_path + } + + tokenizer = test_cfg.pop('tokenizer') + eval_model['tokenizer'] = tokenizer + test_cfg.models = [eval_model] + + # Set up multiple eval dataloaders + first_eval_loader = test_cfg.eval_loader + first_eval_loader.label = 'c4' + # Create second eval dataloader using the arxiv dataset. + second_eval_loader = copy.deepcopy(first_eval_loader) + arxiv_dataset_name = create_arxiv_dataset(tmp_path) + second_eval_loader.data_local = arxiv_dataset_name + second_eval_loader.label = 'arxiv' + test_cfg.eval_loader = om.OmegaConf.create( + [first_eval_loader, second_eval_loader]) + + test_cfg.max_duration = '1ba' + test_cfg.eval_interval = '1ba' + test_cfg.loggers = om.DictConfig({'inmemory': om.DictConfig({})}) + + trainers, eval_gauntlet_df = main(test_cfg) + + assert eval_gauntlet_df is None + assert len(trainers) == 1 # one per model + trainer = trainers[0] + + assert isinstance(trainer.logger.destinations, tuple) + + assert len(trainer.logger.destinations) > 0 + inmemorylogger = trainer.logger.destinations[ + 0] # pyright: ignore [reportGeneralTypeIssues] + assert isinstance(inmemorylogger, InMemoryLogger) + print(inmemorylogger.data.keys()) + + # Checks for first eval dataloader + assert 'metrics/eval/c4/LanguageCrossEntropy' in inmemorylogger.data.keys() + assert isinstance( + inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'], list) + assert len( + inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'][-1]) > 0 + assert isinstance( + inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'][-1], tuple) + + # Checks for second eval dataloader + assert 'metrics/eval/arxiv/LanguageCrossEntropy' in inmemorylogger.data.keys( + ) + assert isinstance( + inmemorylogger.data['metrics/eval/arxiv/LanguageCrossEntropy'], list) + assert len( + inmemorylogger.data['metrics/eval/arxiv/LanguageCrossEntropy'][-1]) > 0 + assert isinstance( + inmemorylogger.data['metrics/eval/arxiv/LanguageCrossEntropy'][-1], + tuple) diff --git a/tests/test_eval_inputs.py b/tests/test_eval_inputs.py index 9c7a130a9b..83104b62b7 100644 --- a/tests/test_eval_inputs.py +++ b/tests/test_eval_inputs.py @@ -57,6 +57,7 @@ def test_optional_mispelled_params_raise_warning(self, 'loggers', 'eval_gauntlet', 'fsdp_config', + 'eval_loader', ] old_cfg = copy.deepcopy(cfg) for param in optional_params: diff --git a/tests/test_train_inputs.py b/tests/test_train_inputs.py index bf90f48ef0..2ed1c9c239 100644 --- a/tests/test_train_inputs.py +++ b/tests/test_train_inputs.py @@ -103,7 +103,7 @@ def test_optional_mispelled_params_raise_warning(self, 'save_folder', 'fsdp_config', 'lora_config', - 'eval_loader_config', + 'eval_loader', 'icl_tasks_config', ] old_cfg = copy.deepcopy(cfg) diff --git a/tests/test_training.py b/tests/test_training.py index 8390834d1d..3cd2963100 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -3,9 +3,6 @@ import copy import os import pathlib -import shutil -import sys -from argparse import Namespace from typing import Any, Optional import pytest @@ -14,95 +11,9 @@ from omegaconf import DictConfig, ListConfig from omegaconf import OmegaConf as om -# Add repo root to path so we can import scripts and test it -repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) -sys.path.append(repo_dir) - -from scripts.data_prep.convert_dataset_hf import main as main_hf # noqa: E402 -from scripts.data_prep.convert_dataset_json import \ - main as main_json # noqa: E402 from scripts.train.train import main # noqa: E402 - - -def create_c4_dataset_xsmall(path: pathlib.Path) -> str: - """Creates a small mocked version of the C4 dataset.""" - c4_dir = os.path.join(path, f'my-copy-c4') - downloaded_split = 'val_xxsmall' - main_hf( - Namespace( - **{ - 'dataset': 'c4', - 'data_subset': 'en', - 'splits': [downloaded_split], - 'out_root': c4_dir, - 'compression': None, - 'concat_tokens': 2048, - 'tokenizer': 'EleutherAI/gpt-neox-20b', - 'tokenizer_kwargs': {}, - 'bos_text': '', - 'eos_text': '<|endoftext|>', - 'no_wrap': False, - 'num_workers': 8 - })) - - # copy the small downloaded_split to other c4 splits for mocking purposes - mocked_splits = ['train', 'val'] - for mocked_split in mocked_splits: - shutil.copytree(os.path.join(c4_dir, 'val_xxsmall'), - os.path.join(c4_dir, mocked_split)) - assert os.path.exists(c4_dir) - return c4_dir - - -def create_arxiv_dataset(path: pathlib.Path) -> str: - """Creates an arxiv dataset.""" - arxiv_dir = os.path.join(path, f'my-copy-arxiv') - downloaded_split = 'train' - - main_json( - Namespace( - **{ - 'path': 'data_prep/example_data/arxiv.jsonl', - 'out_root': arxiv_dir, - 'compression': None, - 'split': downloaded_split, - 'concat_tokens': None, - 'bos_text': None, - 'eos_text': None, - 'no_wrap': False, - 'num_workers': None - })) - - return arxiv_dir - - -def gpt_tiny_cfg(dataset_name: str, device: str): - """Create gpt tiny cfg.""" - conf_path: str = os.path.join(repo_dir, - 'scripts/train/yamls/pretrain/testing.yaml') - with open(conf_path) as f: - test_cfg = om.load(f) - assert isinstance(test_cfg, DictConfig) - - test_cfg.data_local = dataset_name - test_cfg.global_train_batch_size = 1 - test_cfg.device_eval_batch_size = 2 - test_cfg.device_train_microbatch_size = 1 - test_cfg.max_duration = '4ba' - test_cfg.eval_interval = '4ba' - test_cfg.run_name = 'gpt-mini-integration-test' - - test_cfg.model.n_layer = 2 - test_cfg.model.n_embd = 64 - - if device == 'cpu': - test_cfg.model.init_device = 'cpu' - test_cfg.fsdp_config = None - test_cfg.model.attn_config.attn_impl = 'torch' - test_cfg.model.loss_fn = 'torch_crossentropy' - test_cfg.precision = 'fp32' - - return test_cfg +from tests.data_utils import (create_arxiv_dataset, create_c4_dataset_xxsmall, + gpt_tiny_cfg) @pytest.fixture(autouse=False) @@ -122,7 +33,7 @@ def set_correct_cwd(): def test_train_gauntlet(averages: Optional[dict], set_correct_cwd: Any, tmp_path: pathlib.Path): """Test training run with a small dataset.""" - dataset_name = create_c4_dataset_xsmall(tmp_path) + dataset_name = create_c4_dataset_xxsmall(tmp_path) test_cfg = gpt_tiny_cfg(dataset_name, 'cpu') test_cfg.icl_tasks = ListConfig([ DictConfig({ @@ -201,7 +112,7 @@ def test_train_gauntlet(averages: Optional[dict], set_correct_cwd: Any, def test_train_multi_eval(set_correct_cwd: Any, tmp_path: pathlib.Path): """Test training run with multiple eval datasets.""" - c4_dataset_name = create_c4_dataset_xsmall(tmp_path) + c4_dataset_name = create_c4_dataset_xxsmall(tmp_path) test_cfg = gpt_tiny_cfg(c4_dataset_name, 'cpu') # Set up multiple eval dataloaders first_eval_loader = test_cfg.eval_loader