From 12e3896b0d7ffbf6267b5dd53664530a997f1f5c Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 16 Jul 2024 11:33:14 -0700 Subject: [PATCH 01/57] Command utils + train (#1361) * command_cli * save * typo * no noqa --------- Co-authored-by: v-chen_data --- llmfoundry/cli/cli.py | 2 +- llmfoundry/{train => command_utils}/__init__.py | 2 +- llmfoundry/{train => command_utils}/train.py | 0 scripts/train/train.py | 2 +- tests/a_scripts/train/test_train.py | 4 ++-- tests/a_scripts/train/test_train_inputs.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) rename llmfoundry/{train => command_utils}/__init__.py (86%) rename llmfoundry/{train => command_utils}/train.py (100%) diff --git a/llmfoundry/cli/cli.py b/llmfoundry/cli/cli.py index 8e86e76467..f70a3ffa4e 100644 --- a/llmfoundry/cli/cli.py +++ b/llmfoundry/cli/cli.py @@ -6,7 +6,7 @@ import typer from llmfoundry.cli import registry_cli -from llmfoundry.train import train_from_yaml +from llmfoundry.command_utils import train_from_yaml app = typer.Typer(pretty_exceptions_show_locals=False) app.add_typer(registry_cli.app, name='registry') diff --git a/llmfoundry/train/__init__.py b/llmfoundry/command_utils/__init__.py similarity index 86% rename from llmfoundry/train/__init__.py rename to llmfoundry/command_utils/__init__.py index 8a4c2749db..cd3d699f47 100644 --- a/llmfoundry/train/__init__.py +++ b/llmfoundry/command_utils/__init__.py @@ -1,6 +1,6 @@ # Copyright 2024 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 -from llmfoundry.train.train import ( +from llmfoundry.command_utils.train import ( TRAIN_CONFIG_KEYS, TrainConfig, train, diff --git a/llmfoundry/train/train.py b/llmfoundry/command_utils/train.py similarity index 100% rename from llmfoundry/train/train.py rename to llmfoundry/command_utils/train.py diff --git a/scripts/train/train.py b/scripts/train/train.py index 3c8973048b..728010d13a 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import sys -from llmfoundry.train import train_from_yaml +from llmfoundry.command_utils import train_from_yaml if __name__ == '__main__': yaml_path, args_list = sys.argv[1], sys.argv[2:] diff --git a/tests/a_scripts/train/test_train.py b/tests/a_scripts/train/test_train.py index a49f1ac07a..1f724a6070 100644 --- a/tests/a_scripts/train/test_train.py +++ b/tests/a_scripts/train/test_train.py @@ -11,8 +11,8 @@ from omegaconf import DictConfig, ListConfig from omegaconf import OmegaConf as om -from llmfoundry.train import TrainConfig # noqa: E402 -from llmfoundry.train import TRAIN_CONFIG_KEYS, train, validate_config +from llmfoundry.command_utils import TrainConfig # noqa: E402 +from llmfoundry.command_utils import TRAIN_CONFIG_KEYS, train, validate_config from llmfoundry.utils.config_utils import ( make_dataclass_and_log_config, update_batch_size_info, diff --git a/tests/a_scripts/train/test_train_inputs.py b/tests/a_scripts/train/test_train_inputs.py index 328a06a69e..73540afe2f 100644 --- a/tests/a_scripts/train/test_train_inputs.py +++ b/tests/a_scripts/train/test_train_inputs.py @@ -9,7 +9,7 @@ from omegaconf import DictConfig from omegaconf import OmegaConf as om -from llmfoundry.train import train # noqa: E402 +from llmfoundry.command_utils import train def make_fake_index_file(path: str) -> None: From 6d1f1d6b19c32ce1c368ba72a90f62b7d07bd886 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 16 Jul 2024 13:23:37 -0700 Subject: [PATCH 02/57] clear resolver (#1365) Co-authored-by: v-chen_data 
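Patches 01 and 02 relocate the training entry point from llmfoundry.train to llmfoundry.command_utils and have train_from_yaml clear the 'oc.env' OmegaConf resolver before loading the config. For context, a minimal sketch of how that entry point is driven after the rename, mirroring what scripts/train/train.py does in the diff above; the example YAML path and CLI override in the comment are illustrative placeholders, not values taken from this patch series:

    import sys

    from llmfoundry.command_utils import train_from_yaml

    if __name__ == '__main__':
        # e.g. python train.py yamls/pretrain/mpt-125m.yaml max_duration=10ba
        yaml_path, args_list = sys.argv[1], sys.argv[2:]
        train_from_yaml(yaml_path, args_list)
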
--- llmfoundry/command_utils/train.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llmfoundry/command_utils/train.py b/llmfoundry/command_utils/train.py index df0472a775..f49fb28801 100644 --- a/llmfoundry/command_utils/train.py +++ b/llmfoundry/command_utils/train.py @@ -567,6 +567,7 @@ def train_from_yaml( ) -> Trainer: """Run the training with optional overrides from CLI.""" # Load yaml and CLI arguments. + om.clear_resolver('oc.env') with open(yaml_path) as f: yaml_cfg = om.load(f) if args_list: From 94e0809c5fc8381ebd0df84878d7ea7bffc8fe28 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 16 Jul 2024 13:50:29 -0700 Subject: [PATCH 03/57] Add Eval to Foundry CLI (#1345) * move eval script * test * eval * circular import * lazy imports * break apart folders * icl * move eval to train * typo * typo * revert * revert * more revert * more revert * typo * typing * precommit * precommit * optional * precommit * move eval * move eval * rm extra * readd train * precommit * precommit * pr comments --------- Co-authored-by: v-chen_data --- llmfoundry/cli/cli.py | 15 +- llmfoundry/command_utils/__init__.py | 6 + llmfoundry/command_utils/eval.py | 447 +++++++++++++++++++++++ scripts/eval/eval.py | 440 +--------------------- tests/a_scripts/eval/test_eval.py | 8 +- tests/a_scripts/eval/test_eval_inputs.py | 8 +- 6 files changed, 477 insertions(+), 447 deletions(-) create mode 100644 llmfoundry/command_utils/eval.py diff --git a/llmfoundry/cli/cli.py b/llmfoundry/cli/cli.py index f70a3ffa4e..606f6855ff 100644 --- a/llmfoundry/cli/cli.py +++ b/llmfoundry/cli/cli.py @@ -6,7 +6,7 @@ import typer from llmfoundry.cli import registry_cli -from llmfoundry.command_utils import train_from_yaml +from llmfoundry.command_utils import eval_from_yaml, train_from_yaml app = typer.Typer(pretty_exceptions_show_locals=False) app.add_typer(registry_cli.app, name='registry') @@ -25,5 +25,18 @@ def train( train_from_yaml(yaml_path, args_list) +@app.command(name='eval') +def eval( + yaml_path: str = typer.Argument( + ..., + help='Path to the YAML configuration file', + ), # type: ignore + args_list: Optional[list[str]] = typer. 
+ Argument(None, help='Additional command line arguments'), # type: ignore +): + """Run the training with optional overrides from CLI.""" + eval_from_yaml(yaml_path, args_list) + + if __name__ == '__main__': app() diff --git a/llmfoundry/command_utils/__init__.py b/llmfoundry/command_utils/__init__.py index cd3d699f47..7dd8a32c36 100644 --- a/llmfoundry/command_utils/__init__.py +++ b/llmfoundry/command_utils/__init__.py @@ -1,5 +1,9 @@ # Copyright 2024 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 +from llmfoundry.command_utils.eval import ( + eval_from_yaml, + evaluate, +) from llmfoundry.command_utils.train import ( TRAIN_CONFIG_KEYS, TrainConfig, @@ -14,4 +18,6 @@ 'TrainConfig', 'TRAIN_CONFIG_KEYS', 'validate_config', + 'evaluate', + 'eval_from_yaml', ] diff --git a/llmfoundry/command_utils/eval.py b/llmfoundry/command_utils/eval.py new file mode 100644 index 0000000000..7d8306c0a0 --- /dev/null +++ b/llmfoundry/command_utils/eval.py @@ -0,0 +1,447 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +import logging +import os +import time +from typing import Any, Dict, Optional, Tuple, Union + +import pandas as pd +import torch +from composer.core import Callback +from composer.loggers.logger_destination import LoggerDestination +from composer.trainer import Trainer +from composer.utils import dist, get_device, reproducibility +from omegaconf import DictConfig +from omegaconf import OmegaConf as om + +from llmfoundry.utils import ( + find_mosaicml_logger, + log_eval_analytics, + maybe_create_mosaicml_logger, +) +from llmfoundry.utils.builders import ( + add_metrics_to_eval_loaders, + build_callback, + build_composer_model, + build_evaluators, + build_logger, + build_tokenizer, +) +from llmfoundry.utils.config_utils import ( + EVAL_CONFIG_KEYS, + EvalConfig, + log_config, + make_dataclass_and_log_config, + process_init_device, +) +from llmfoundry.utils.registry_utils import import_file + +log = logging.getLogger(__name__) + + +def evaluate_model( + tokenizer: Dict[str, Any], + model_name: str, + model: Dict[str, Any], + dist_timeout: Union[float, int], + run_name: str, + seed: int, + icl_tasks: Union[str, list[Dict[str, Any]]], + max_seq_len: int, + device_eval_batch_size: Union[int, float], + eval_gauntlet_config: Optional[Union[str, Dict[str, Any]]], + eval_loader_config: Optional[Union[Dict[str, Any], list[Dict[str, Any]]]], + fsdp_config: Optional[Dict[str, Any]], + loggers: list[LoggerDestination], + python_log_level: Optional[str], + precision: str, + eval_gauntlet_df: Optional[pd.DataFrame], + eval_subset_num_batches: int, + icl_subset_num_batches: Optional[int], + callback_configs: Optional[Dict[str, Any]], + metadata: Optional[Dict[str, str]], + logged_config: Dict[str, Any], + should_log_config: bool = True, + load_path: Optional[str] = None, +): + log.info(f'Evaluating model: {model_name}') + # Build tokenizer and model + tokenizer_cfg = tokenizer + tokenizer_name = tokenizer_cfg['name'] + tokenizer_kwargs = tokenizer_cfg.get('kwargs', {}) + tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) + + evaluators, logger_keys, eval_gauntlet_callback = build_evaluators( + eval_loader_config, + icl_tasks, + eval_gauntlet_config, + tokenizer=tokenizer, + device_eval_batch_size=device_eval_batch_size, + icl_seq_len=max_seq_len, + icl_subset_num_batches=icl_subset_num_batches, + ) + + # Callbacks + callbacks: list[Callback] = [ + build_callback(name=str(name), kwargs=callback_cfg) + for name, callback_cfg in 
callback_configs.items() + ] if callback_configs else [] + + if eval_gauntlet_callback is not None: + callbacks.append(eval_gauntlet_callback) + + if metadata is not None: + # Find the MosaicMLLogger + mosaicml_logger = find_mosaicml_logger(loggers) + + if mosaicml_logger is not None: + mosaicml_logger.log_metrics(metadata) + mosaicml_logger._flush_metadata(force_flush=True) + + if fsdp_config and model.get('load_in_8bit', False): + raise ValueError( + 'The FSDP config block is not supported when loading ' + + 'Hugging Face models in 8bit.', + ) + + init_context = process_init_device(model, fsdp_config) + + name = model.pop('name') + composer_model = build_composer_model( + name=name, + tokenizer=tokenizer, + init_context=init_context, + cfg=model, + ) + + # Now add the eval metrics + if eval_loader_config is not None: + train_metrics = composer_model.get_metrics(is_train=True) + evaluators = add_metrics_to_eval_loaders( + evaluators, + list(train_metrics.keys()), + ) + + if eval_gauntlet_df is None and eval_gauntlet_callback is not None: + eval_gauntlet_df = pd.DataFrame( + columns=['model_name'] + list(eval_gauntlet_callback.averages) + + [t['name'] for t in eval_gauntlet_callback.categories], + ) + + if name == 'mpt_causal_lm' and load_path is None: + raise ValueError( + 'MPT causal LMs require a load_path to the checkpoint for model evaluation.' + + + ' Please check your yaml and the model_cfg to ensure that load_path is set.', + ) + + assert composer_model is not None + + log.info(f'Building trainer for {model_name}...') + trainer = Trainer( + run_name=run_name, + seed=seed, + model=composer_model, + callbacks=callbacks, + loggers=loggers, + precision=precision, + fsdp_config=fsdp_config, + load_path=load_path, + load_weights_only=True, + progress_bar=False, + log_to_console=True, + dist_timeout=dist_timeout, + python_log_level=python_log_level, + ) + + if should_log_config: + log.info('Evaluation config:') + log_config(logged_config) + + log.info(f'Starting eval for {model_name}...') + if torch.cuda.is_available(): + torch.cuda.synchronize() + a = time.time() + trainer.eval( + eval_dataloader=evaluators, + subset_num_batches=eval_subset_num_batches, + ) + if torch.cuda.is_available(): + torch.cuda.synchronize() + b = time.time() + + log.info(f'Ran {model_name} eval in: {b-a} seconds') + return (trainer, logger_keys, eval_gauntlet_callback, eval_gauntlet_df) + + +def evaluate(cfg: DictConfig) -> Tuple[list[Trainer], pd.DataFrame]: + # Run user provided code if specified + for code_path in cfg.get('code_paths', []): + import_file(code_path) + + logged_cfg, eval_config = make_dataclass_and_log_config( + cfg, + EvalConfig, + EVAL_CONFIG_KEYS, + icl_tasks_required=True, + ) + + model_configs = eval_config.models + eval_gauntlet_config = eval_config.eval_gauntlet or eval_config.eval_gauntlet_str + + fsdp_config = eval_config.fsdp_config + + # Mandatory Evaluation Parameters + icl_tasks = eval_config.icl_tasks or eval_config.icl_tasks_str + if icl_tasks is None: + raise ValueError('icl_tasks must be specified in the config') + + # Optional Evaluation Parameters with default values + eval_loader_config = eval_config.eval_loader or eval_config.eval_loaders + default_run_name: str = os.environ.get('RUN_NAME', 'llm') + run_name = eval_config.run_name if eval_config.run_name else default_run_name + + reproducibility.seed_all(eval_config.seed) + dist.initialize_dist(get_device(None), timeout=eval_config.dist_timeout) + + if eval_config.python_log_level is not None: + logging.basicConfig( + # 
Example of format string + # 2022-06-29 11:22:26,152: rank0[822018][MainThread]: INFO: Message here + format= + f'%(asctime)s: rank{dist.get_global_rank()}[%(process)d][%(threadName)s]: %(levelname)s: %(name)s: %(message)s', + ) + logging.getLogger('llmfoundry').setLevel( + eval_config.python_log_level.upper(), + ) + + # Default argument values for evaluate_model + eval_gauntlet_df = None + models_df = None + composite_scores = None + trainers = [] + + # Build loggers + loggers: list[LoggerDestination] = [ + build_logger(name, logger_cfg) + for name, logger_cfg in (eval_config.loggers or {}).items() + ] + + mosaicml_logger = find_mosaicml_logger(loggers) + if mosaicml_logger is None: + mosaicml_logger = maybe_create_mosaicml_logger() + # mosaicml_logger will be None if run isn't on MosaicML platform + if mosaicml_logger is not None: + loggers.append(mosaicml_logger) + + # mosaicml_logger will be None if the run isn't from the MosaicML platform + if mosaicml_logger is not None: + log_eval_analytics( + mosaicml_logger, + model_configs, + icl_tasks, + eval_gauntlet_config, + ) + + for model_cfg in model_configs: + + attn_config = model_cfg['model'].get('attn_config', None) + if attn_config is not None: + seq_parallel_world_size = attn_config.get( + 'seq_parallel_world_size', + None, + ) + if seq_parallel_world_size is not None and seq_parallel_world_size != 1: + raise ValueError( + 'Offline eval does not support sequence parallelism.', + ) + + (trainer, logger_keys, eval_gauntlet_callback, + eval_gauntlet_df) = evaluate_model( + dist_timeout=eval_config.dist_timeout, + run_name=run_name, + seed=eval_config.seed, + icl_tasks=icl_tasks, + max_seq_len=eval_config.max_seq_len, + device_eval_batch_size=eval_config.device_eval_batch_size, + eval_gauntlet_config=eval_gauntlet_config, + eval_loader_config=eval_loader_config, + fsdp_config=fsdp_config, + loggers=loggers, + python_log_level=eval_config.python_log_level, + precision=eval_config.precision, + eval_gauntlet_df=eval_gauntlet_df, + callback_configs=eval_config.callbacks, + eval_subset_num_batches=eval_config.eval_subset_num_batches, + icl_subset_num_batches=eval_config.icl_subset_num_batches, + metadata=eval_config.metadata, + logged_config=logged_cfg, + should_log_config=eval_config.log_config, + **model_cfg, + ) + trainers.append(trainer) + + if eval_gauntlet_callback is not None: + composite_scores = eval_gauntlet_callback.eval_after_all( + trainer.state, + trainer.logger, + ) + + benchmark_to_taxonomy = {} + if eval_gauntlet_callback is not None: + for t in eval_gauntlet_callback.categories: + for b in t['benchmarks']: + benchmark_to_taxonomy[b['name']] = t['name'] + + assert 'model_name' in model_cfg, 'model_name must be specified in model config' + model_results = calculate_markdown_results( + logger_keys, + trainer, + benchmark_to_taxonomy, + model_cfg['model_name'], + ) + + if models_df is None: + models_df = model_results + else: + models_df = pd.concat([models_df, model_results], ignore_index=True) + + if eval_gauntlet_df is not None and eval_gauntlet_callback is not None: + assert composite_scores is not None + row = {'model_name': model_cfg['model_name']} + row.update({ + k.split('/')[-1]: v for k, v in composite_scores.items() + }) + eval_gauntlet_df = pd.concat([ + eval_gauntlet_df, + pd.DataFrame([row]), + ], + ignore_index=True) + + print(f'Printing gauntlet results for all models') + + print( + eval_gauntlet_df.sort_values( + list(eval_gauntlet_callback.averages.keys())[0], + ascending=False, + ).to_markdown(index=False), + 
) + print(f'Printing complete results for all models') + assert models_df is not None + print(models_df.to_markdown(index=False)) + + trainer.close() + + return trainers, eval_gauntlet_df + + +def calculate_markdown_results( + logger_keys: list[str], + trainer: Trainer, + benchmark_to_taxonomy: Dict[str, str], + model_name: str, +): + results = {} + + for key in logger_keys: + # dl_name is either 2-tuple (benchmark_name, num_fewshot) + # or 3-tuple (benchmark_name, num_fewshot, subcategory) + dl_name, metric_name = key.split('/')[1:-1], key.split('/')[-1] + if 'Accuracy' not in metric_name: + continue + + metric = trainer.state.eval_metrics.get('/'.join(dl_name), + {}).get(metric_name, None) + + if metric is None: + continue + if dl_name[1] not in results: + results[dl_name[1]] = {} + + if dl_name[0] not in results[dl_name[1]]: + results[dl_name[1]][dl_name[0]] = {} + + if metric_name not in results[dl_name[1]][dl_name[0]]: + results[dl_name[1]][dl_name[0]][metric_name] = [] + + results[dl_name[1]][dl_name[0]][metric_name].append({ + 'val': metric.compute(), + 'subcat': dl_name[-1] if len(dl_name) == 3 else 'no_subcat', + }) + + df = pd.DataFrame( + columns=[ + 'Category', + 'Benchmark', + 'Subtask', + 'Accuracy', + 'Number few shot', + 'Model', + ], + ) + + for num_shot in results: + for benchmark in results[num_shot]: + for metric in results[num_shot][benchmark]: + subscores = results[num_shot][benchmark][metric] + if len(subscores) == 1: + row = { + 'Category': benchmark_to_taxonomy.get(benchmark, ''), + 'Benchmark': benchmark, + 'Subtask': None, + 'Accuracy': subscores[0]['val'], + 'Number few shot': num_shot, + 'Model': model_name, + } + df = pd.concat([df, pd.DataFrame([row])], ignore_index=True) + else: + row = { + 'Category': + benchmark_to_taxonomy.get(benchmark, ''), + 'Benchmark': + benchmark, + 'Subtask': + 'Average', + 'Accuracy': + sum(s['val'] for s in subscores) / len(subscores), + 'Number few shot': + num_shot, + 'Model': + model_name, + } + df = pd.concat([df, pd.DataFrame([row])], ignore_index=True) + for sub in subscores: + row = { + 'Category': + benchmark_to_taxonomy.get(benchmark, ''), + 'Benchmark': + None, + 'Subtask': + sub['subcat'], + 'Accuracy': + sub['val'], + 'Number few shot': + num_shot, + 'Model': + model_name, + } + df = pd.concat([df, pd.DataFrame([row])], + ignore_index=True) + return df + + +def eval_from_yaml( + yaml_path: str, + args_list: Optional[list[str]], +) -> Tuple[list[Trainer], pd.DataFrame]: + """Run the evaluation with optional overrides from CLI.""" + # Load yaml and CLI arguments. 
+ om.clear_resolver('oc.env') + with open(yaml_path) as f: + yaml_cfg = om.load(f) + if args_list: + cli_cfg = om.from_cli(args_list) + yaml_cfg = om.merge(yaml_cfg, cli_cfg) + assert isinstance(yaml_cfg, DictConfig) + return evaluate(yaml_cfg) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 9667db6308..caafda4b87 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -1,445 +1,9 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 - -import logging -import os import sys -import time -from typing import Any, Dict, List, Optional, Tuple, Union - -import pandas as pd -import torch -from composer.core import Callback -from composer.loggers.logger_destination import LoggerDestination -from composer.trainer import Trainer -from composer.utils import dist, get_device, reproducibility -from omegaconf import DictConfig -from omegaconf import OmegaConf as om -from rich.traceback import install - -from llmfoundry.utils import ( - find_mosaicml_logger, - log_eval_analytics, - maybe_create_mosaicml_logger, -) - -install() -from llmfoundry.utils.builders import ( - add_metrics_to_eval_loaders, - build_callback, - build_composer_model, - build_evaluators, - build_logger, - build_tokenizer, -) -from llmfoundry.utils.config_utils import ( - EVAL_CONFIG_KEYS, - EvalConfig, - log_config, - make_dataclass_and_log_config, - process_init_device, -) -from llmfoundry.utils.registry_utils import import_file - -log = logging.getLogger(__name__) - - -def evaluate_model( - tokenizer: Dict[str, Any], - model_name: str, - model: Dict[str, Any], - dist_timeout: Union[float, int], - run_name: str, - seed: int, - icl_tasks: Union[str, List[Dict[str, Any]]], - max_seq_len: int, - device_eval_batch_size: Union[int, float], - eval_gauntlet_config: Optional[Union[str, Dict[str, Any]]], - eval_loader_config: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]], - fsdp_config: Optional[Dict[str, Any]], - loggers: List[LoggerDestination], - python_log_level: Optional[str], - precision: str, - eval_gauntlet_df: Optional[pd.DataFrame], - eval_subset_num_batches: int, - icl_subset_num_batches: Optional[int], - callback_configs: Optional[Dict[str, Any]], - metadata: Optional[Dict[str, str]], - logged_config: Dict[str, Any], - should_log_config: bool = True, - load_path: Optional[str] = None, -): - log.info(f'Evaluating model: {model_name}') - # Build tokenizer and model - tokenizer_cfg = tokenizer - tokenizer_name = tokenizer_cfg['name'] - tokenizer_kwargs = tokenizer_cfg.get('kwargs', {}) - tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) - - evaluators, logger_keys, eval_gauntlet_callback = build_evaluators( - eval_loader_config, - icl_tasks, - eval_gauntlet_config, - tokenizer=tokenizer, - device_eval_batch_size=device_eval_batch_size, - icl_seq_len=max_seq_len, - icl_subset_num_batches=icl_subset_num_batches, - ) - - # Callbacks - callbacks: List[Callback] = [ - build_callback(name=str(name), kwargs=callback_cfg) - for name, callback_cfg in callback_configs.items() - ] if callback_configs else [] - - if eval_gauntlet_callback is not None: - callbacks.append(eval_gauntlet_callback) - - if metadata is not None: - # Find the MosaicMLLogger - mosaicml_logger = find_mosaicml_logger(loggers) - - if mosaicml_logger is not None: - mosaicml_logger.log_metrics(metadata) - mosaicml_logger._flush_metadata(force_flush=True) - - if fsdp_config and model.get('load_in_8bit', False): - raise ValueError( - 'The FSDP config block is not supported when loading ' + - 'Hugging 
Face models in 8bit.', - ) - - init_context = process_init_device(model, fsdp_config) - - name = model.pop('name') - composer_model = build_composer_model( - name=name, - tokenizer=tokenizer, - init_context=init_context, - cfg=model, - ) - - # Now add the eval metrics - if eval_loader_config is not None: - train_metrics = composer_model.get_metrics(is_train=True) - evaluators = add_metrics_to_eval_loaders( - evaluators, - list(train_metrics.keys()), - ) - - if eval_gauntlet_df is None and eval_gauntlet_callback is not None: - eval_gauntlet_df = pd.DataFrame( - columns=['model_name'] + list(eval_gauntlet_callback.averages) + - [t['name'] for t in eval_gauntlet_callback.categories], - ) - - if name == 'mpt_causal_lm' and load_path is None: - raise ValueError( - 'MPT causal LMs require a load_path to the checkpoint for model evaluation.' - + - ' Please check your yaml and the model_cfg to ensure that load_path is set.', - ) - - assert composer_model is not None - - log.info(f'Building trainer for {model_name}...') - trainer = Trainer( - run_name=run_name, - seed=seed, - model=composer_model, - callbacks=callbacks, - loggers=loggers, - precision=precision, - fsdp_config=fsdp_config, - load_path=load_path, - load_weights_only=True, - progress_bar=False, - log_to_console=True, - dist_timeout=dist_timeout, - python_log_level=python_log_level, - ) - - if should_log_config: - log.info('Evaluation config:') - log_config(logged_config) - - log.info(f'Starting eval for {model_name}...') - if torch.cuda.is_available(): - torch.cuda.synchronize() - a = time.time() - trainer.eval( - eval_dataloader=evaluators, - subset_num_batches=eval_subset_num_batches, - ) - if torch.cuda.is_available(): - torch.cuda.synchronize() - b = time.time() - - log.info(f'Ran {model_name} eval in: {b-a} seconds') - return (trainer, logger_keys, eval_gauntlet_callback, eval_gauntlet_df) - - -def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: - # Run user provided code if specified - for code_path in cfg.get('code_paths', []): - import_file(code_path) - - logged_cfg, eval_config = make_dataclass_and_log_config( - cfg, - EvalConfig, - EVAL_CONFIG_KEYS, - icl_tasks_required=True, - ) - - model_configs = eval_config.models - eval_gauntlet_config = eval_config.eval_gauntlet or eval_config.eval_gauntlet_str - - fsdp_config = eval_config.fsdp_config - - # Mandatory Evaluation Parameters - icl_tasks = eval_config.icl_tasks or eval_config.icl_tasks_str - if icl_tasks is None: - raise ValueError('icl_tasks must be specified in the config') - - # Optional Evaluation Parameters with default values - eval_loader_config = eval_config.eval_loader or eval_config.eval_loaders - default_run_name: str = os.environ.get('RUN_NAME', 'llm') - run_name = eval_config.run_name if eval_config.run_name else default_run_name - - reproducibility.seed_all(eval_config.seed) - dist.initialize_dist(get_device(None), timeout=eval_config.dist_timeout) - - if eval_config.python_log_level is not None: - logging.basicConfig( - # Example of format string - # 2022-06-29 11:22:26,152: rank0[822018][MainThread]: INFO: Message here - format= - f'%(asctime)s: rank{dist.get_global_rank()}[%(process)d][%(threadName)s]: %(levelname)s: %(name)s: %(message)s', - ) - logging.getLogger('llmfoundry').setLevel( - eval_config.python_log_level.upper(), - ) - - # Default argument values for evaluate_model - eval_gauntlet_df = None - models_df = None - composite_scores = None - trainers = [] - - # Build loggers - loggers: List[LoggerDestination] = [ - build_logger(name, 
logger_cfg) - for name, logger_cfg in (eval_config.loggers or {}).items() - ] - - mosaicml_logger = find_mosaicml_logger(loggers) - if mosaicml_logger is None: - mosaicml_logger = maybe_create_mosaicml_logger() - # mosaicml_logger will be None if run isn't on MosaicML platform - if mosaicml_logger is not None: - loggers.append(mosaicml_logger) - - # mosaicml_logger will be None if the run isn't from the MosaicML platform - if mosaicml_logger is not None: - log_eval_analytics( - mosaicml_logger, - model_configs, - icl_tasks, - eval_gauntlet_config, - ) - - for model_cfg in model_configs: - - attn_config = model_cfg['model'].get('attn_config', None) - if attn_config is not None: - seq_parallel_world_size = attn_config.get( - 'seq_parallel_world_size', - None, - ) - if seq_parallel_world_size is not None and seq_parallel_world_size != 1: - raise ValueError( - 'Offline eval does not support sequence parallelism.', - ) - - (trainer, logger_keys, eval_gauntlet_callback, - eval_gauntlet_df) = evaluate_model( - dist_timeout=eval_config.dist_timeout, - run_name=run_name, - seed=eval_config.seed, - icl_tasks=icl_tasks, - max_seq_len=eval_config.max_seq_len, - device_eval_batch_size=eval_config.device_eval_batch_size, - eval_gauntlet_config=eval_gauntlet_config, - eval_loader_config=eval_loader_config, - fsdp_config=fsdp_config, - loggers=loggers, - python_log_level=eval_config.python_log_level, - precision=eval_config.precision, - eval_gauntlet_df=eval_gauntlet_df, - callback_configs=eval_config.callbacks, - eval_subset_num_batches=eval_config.eval_subset_num_batches, - icl_subset_num_batches=eval_config.icl_subset_num_batches, - metadata=eval_config.metadata, - logged_config=logged_cfg, - should_log_config=eval_config.log_config, - **model_cfg, - ) - trainers.append(trainer) - - if eval_gauntlet_callback is not None: - composite_scores = eval_gauntlet_callback.eval_after_all( - trainer.state, - trainer.logger, - ) - - benchmark_to_taxonomy = {} - if eval_gauntlet_callback is not None: - for t in eval_gauntlet_callback.categories: - for b in t['benchmarks']: - benchmark_to_taxonomy[b['name']] = t['name'] - - assert 'model_name' in model_cfg, 'model_name must be specified in model config' - model_results = calculate_markdown_results( - logger_keys, - trainer, - benchmark_to_taxonomy, - model_cfg['model_name'], - ) - - if models_df is None: - models_df = model_results - else: - models_df = pd.concat([models_df, model_results], ignore_index=True) - - if eval_gauntlet_df is not None and eval_gauntlet_callback is not None: - assert composite_scores is not None - row = {'model_name': model_cfg['model_name']} - row.update({ - k.split('/')[-1]: v for k, v in composite_scores.items() - }) - eval_gauntlet_df = pd.concat([ - eval_gauntlet_df, - pd.DataFrame([row]), - ], - ignore_index=True) - - print(f'Printing gauntlet results for all models') - - print( - eval_gauntlet_df.sort_values( - list(eval_gauntlet_callback.averages.keys())[0], - ascending=False, - ).to_markdown(index=False), - ) - print(f'Printing complete results for all models') - assert models_df is not None - print(models_df.to_markdown(index=False)) - - trainer.close() - - return trainers, eval_gauntlet_df - - -def calculate_markdown_results( - logger_keys: List[str], - trainer: Trainer, - benchmark_to_taxonomy: Dict[str, str], - model_name: str, -): - results = {} - - for key in logger_keys: - # dl_name is either 2-tuple (benchmark_name, num_fewshot) - # or 3-tuple (benchmark_name, num_fewshot, subcategory) - dl_name, metric_name = 
key.split('/')[1:-1], key.split('/')[-1] - if 'Accuracy' not in metric_name: - continue - - metric = trainer.state.eval_metrics.get('/'.join(dl_name), - {}).get(metric_name, None) - - if metric is None: - continue - if dl_name[1] not in results: - results[dl_name[1]] = {} - - if dl_name[0] not in results[dl_name[1]]: - results[dl_name[1]][dl_name[0]] = {} - - if metric_name not in results[dl_name[1]][dl_name[0]]: - results[dl_name[1]][dl_name[0]][metric_name] = [] - - results[dl_name[1]][dl_name[0]][metric_name].append({ - 'val': metric.compute(), - 'subcat': dl_name[-1] if len(dl_name) == 3 else 'no_subcat', - }) - - df = pd.DataFrame( - columns=[ - 'Category', - 'Benchmark', - 'Subtask', - 'Accuracy', - 'Number few shot', - 'Model', - ], - ) - - for num_shot in results: - for benchmark in results[num_shot]: - for metric in results[num_shot][benchmark]: - subscores = results[num_shot][benchmark][metric] - if len(subscores) == 1: - row = { - 'Category': benchmark_to_taxonomy.get(benchmark, ''), - 'Benchmark': benchmark, - 'Subtask': None, - 'Accuracy': subscores[0]['val'], - 'Number few shot': num_shot, - 'Model': model_name, - } - df = pd.concat([df, pd.DataFrame([row])], ignore_index=True) - else: - row = { - 'Category': - benchmark_to_taxonomy.get(benchmark, ''), - 'Benchmark': - benchmark, - 'Subtask': - 'Average', - 'Accuracy': - sum(s['val'] for s in subscores) / len(subscores), - 'Number few shot': - num_shot, - 'Model': - model_name, - } - df = pd.concat([df, pd.DataFrame([row])], ignore_index=True) - for sub in subscores: - row = { - 'Category': - benchmark_to_taxonomy.get(benchmark, ''), - 'Benchmark': - None, - 'Subtask': - sub['subcat'], - 'Accuracy': - sub['val'], - 'Number few shot': - num_shot, - 'Model': - model_name, - } - df = pd.concat([df, pd.DataFrame([row])], - ignore_index=True) - return df +from llmfoundry.command_utils import eval_from_yaml if __name__ == '__main__': yaml_path, args_list = sys.argv[1], sys.argv[2:] - with open(yaml_path) as f: - yaml_cfg = om.load(f) - cli_cfg = om.from_cli(args_list) - cfg = om.merge(yaml_cfg, cli_cfg) - assert isinstance(cfg, DictConfig) - main(cfg) + eval_from_yaml(yaml_path, args_list) diff --git a/tests/a_scripts/eval/test_eval.py b/tests/a_scripts/eval/test_eval.py index 01f3760d26..fc0dc8a882 100644 --- a/tests/a_scripts/eval/test_eval.py +++ b/tests/a_scripts/eval/test_eval.py @@ -11,10 +11,10 @@ from composer import Trainer from composer.loggers import InMemoryLogger +from llmfoundry.command_utils import evaluate from llmfoundry.utils import build_tokenizer from llmfoundry.utils.builders import build_composer_model from llmfoundry.utils.config_utils import EVAL_CONFIG_KEYS, to_dict_container -from scripts.eval.eval import main # noqa: E402 from tests.data_utils import create_c4_dataset_xxsmall, gpt_tiny_cfg @@ -75,7 +75,7 @@ def test_icl_eval( eval_cfg = copy.deepcopy(eval_cfg) eval_cfg.models[0].load_path = mock_saved_model_path assert isinstance(eval_cfg, om.DictConfig) - main(eval_cfg) + evaluate(eval_cfg) out, _ = capfd.readouterr() expected_results = '| Category | Benchmark | Subtask | Accuracy | Number few shot | Model |\n|:----------------------------|:---------------|:----------|-----------:|:------------------|:---------|\n| language_understanding_lite | lambada_openai | | 0 | 0-shot | tiny_mpt |' assert expected_results in out @@ -135,14 +135,14 @@ def test_loader_eval( test_cfg.loggers = om.DictConfig({'inmemory': om.DictConfig({})}) # This test uses a training yaml with training-only keys present. 
- # We exclude these keys before calling `main` from the eval script. + # We exclude these keys before calling `evaluate` from the eval script. allowed_keys = EVAL_CONFIG_KEYS present_keys = set(test_cfg.keys()) keys_to_pop = present_keys.difference(allowed_keys) [test_cfg.pop(key) for key in keys_to_pop] - trainers, eval_gauntlet_df = main(test_cfg) + trainers, eval_gauntlet_df = evaluate(test_cfg) assert eval_gauntlet_df is None assert len(trainers) == 1 # one per model diff --git a/tests/a_scripts/eval/test_eval_inputs.py b/tests/a_scripts/eval/test_eval_inputs.py index 0ca5765a26..86243ba154 100644 --- a/tests/a_scripts/eval/test_eval_inputs.py +++ b/tests/a_scripts/eval/test_eval_inputs.py @@ -8,7 +8,7 @@ from omegaconf import DictConfig from omegaconf import OmegaConf as om -from scripts.eval.eval import main # noqa: E402 +from llmfoundry.command_utils import evaluate class TestHuggingFaceEvalYAMLInputs: @@ -44,7 +44,7 @@ def test_mispelled_mandatory_params_fail(self, cfg: DictConfig) -> None: ValueError, )): cfg[p + '-mispelled'] = cfg.pop(p) - main(cfg) + evaluate(cfg) cfg[p] = cfg.pop(p + '-mispelled') def test_optional_mispelled_params_raise_error( @@ -68,7 +68,7 @@ def test_optional_mispelled_params_raise_error( updated_param = param + '-mispelling' cfg[updated_param] = orig_value with pytest.raises(ValueError): - main(cfg) + evaluate(cfg) # restore configs. cfg = copy.deepcopy(old_cfg) @@ -105,4 +105,4 @@ def test_empty_load_path_raises_error(self, cfg: DictConfig) -> None: + ' Please check your yaml and the model_cfg to ensure that load_path is set.' cfg.models[0].load_path = None with pytest.raises(ValueError, match=error_string): - main(cfg) + evaluate(cfg) From cabc1a713a801f6ca8b415ec65c716eaf65538cc Mon Sep 17 00:00:00 2001 From: Vansh Singh Date: Tue, 16 Jul 2024 15:21:49 -0700 Subject: [PATCH 04/57] Enhanced Logging for convert_delta_to_json and convert_text_to_mds (#1366) * Add in generic logs * add back method * typo * remove newline * single quote logs only * fix log.error message quotes too * One remaining double-quote string * autoformat * format * remove whitespace * ruff --- scripts/data_prep/convert_delta_to_json.py | 34 +++++++++++++++- scripts/data_prep/convert_text_to_mds.py | 46 +++++++++++++++++++++- 2 files changed, 78 insertions(+), 2 deletions(-) diff --git a/scripts/data_prep/convert_delta_to_json.py b/scripts/data_prep/convert_delta_to_json.py index f664f5baca..3b88ba668f 100644 --- a/scripts/data_prep/convert_delta_to_json.py +++ b/scripts/data_prep/convert_delta_to_json.py @@ -82,6 +82,8 @@ def to_cf(self: SparkConnectClient, - Total row count of all parts of the result. - A boolean indicating whether the result has been truncated. """ + log.info(f'Executing query plan with format: {type}') + req = self._execute_plan_request_with_metadata() req.plan.CopyFrom(plan) @@ -166,6 +168,7 @@ def collect_as_cf(self: DataFrame, - Total row count of all parts of the result. - A boolean indicating whether the result is truncated or overflowed. 
""" + log.info(f'Collecting DataFrame as cloud fetch with format: {type}') query = self._plan.to_proto(self._session.client) # pyright: ignore return self._session.client.to_cf(query, type) # pyright: ignore @@ -182,13 +185,18 @@ def iterative_combine_jsons(json_directory: str, output_file: str) -> None: json_directory(str): directory containing the JSONL files output_file(str): path to the output combined JSONL file """ + log.info( + f'Starting to combine JSON files from {json_directory} into {output_file}', + ) json_files = [f for f in os.listdir(json_directory) if f.endswith('.jsonl')] + log.info(f'Found {len(json_files)} JSON files to combine') with open(output_file, 'w') as outfile: for file_name in json_files: + log.debug(f'Processing file: {file_name}') with open(os.path.join(json_directory, file_name), 'r') as infile: for line in infile: outfile.write(line) - log.info('JSON files have been combined into a JSONL file.') + log.info('JSON files have been successfully combined into a JSONL file.') def run_query( @@ -207,6 +215,9 @@ def run_query( spark (Optional[SparkSession]): spark session collect (bool): whether to get the underlying data from spark dataframe """ + log.info(f'Executing query using method: {method}') + log.debug(f'Query: {query}') + if method == 'dbsql': if cursor is None: raise ValueError(f'cursor cannot be None if using method dbsql') @@ -247,6 +258,8 @@ def download( resp_format (str): whether to use arrow or json when collect compressed (bool): if data is compressed before downloading. Need decompress if compressed=True. """ + log.info(f'Downloading part {ipart} from URL: {url}') + resp = requests.get(url) if resp.status_code == 200: if resp_format == 'json': @@ -294,6 +307,7 @@ def format_tablename(table_name: str) -> str: Args: table_name (str): catalog.scheme.tablename on UC """ + log.debug(f'Formatting table name: {table_name}') match = re.match(TABLENAME_PATTERN, table_name) if match is None: @@ -337,6 +351,7 @@ def fetch_data( Returns: None: The function doesn't return any value, but writes the result to a JSONL file. """ + log.info(f'Fetching data from {start} to {end} using method: {method}') query = f""" WITH NumberedRows AS ( SELECT @@ -428,6 +443,11 @@ def fetch( sparkSession (pyspark.sql.sparksession): spark session dbsql (databricks.sql.connect): dbsql session """ + log.info(f'Starting data fetch for table: {tablename}') + log.info( + f'Method: {method}, Batch size: {batch_size}, Processes: {processes}', + ) + cursor = dbsql.cursor() if dbsql is not None else None try: nrows = get_total_rows( @@ -505,6 +525,11 @@ def validate_and_get_cluster_info( http_path (Optional[str]): http path to use for sql connect use_serverless (bool): whether to use serverless or not """ + log.info('Validating cluster information and getting connection details') + log.debug( + f'Cluster ID: {cluster_id}, Host: {databricks_host}, Use Serverless: {use_serverless}', + ) + method = 'dbsql' dbsql = None sparkSession = None @@ -575,6 +600,10 @@ def validate_and_get_cluster_info( def fetch_DT(args: Namespace) -> None: """Fetch UC Delta Table to local as jsonl.""" log.info(f'Start .... 
Convert delta to json') + log.info('Starting Delta Table to JSON conversion process') + log.info(f'Delta Table: {args.delta_table_name}') + log.info(f'Output Folder: {args.json_output_folder}') + log.info(f'Output Filename: {args.json_output_filename}') obj = urllib.parse.urlparse(args.json_output_folder) if obj.scheme != '': @@ -626,6 +655,8 @@ def fetch_DT(args: Namespace) -> None: os.path.join(args.json_output_folder, args.json_output_filename), ) + log.info('Delta Table to JSON conversion completed successfully') + if __name__ == '__main__': parser = ArgumentParser( @@ -695,3 +726,4 @@ def fetch_DT(args: Namespace) -> None: tik = time.time() fetch_DT(args) log.info(f'Elapsed time {time.time() - tik}') + log.info('Delta Table to JSON conversion script completed') diff --git a/scripts/data_prep/convert_text_to_mds.py b/scripts/data_prep/convert_text_to_mds.py index 92c36eb35d..8af8280465 100644 --- a/scripts/data_prep/convert_text_to_mds.py +++ b/scripts/data_prep/convert_text_to_mds.py @@ -58,11 +58,15 @@ def __init__( ): self.files = files super().__init__(tokenizer, max_length, bos_text, eos_text, no_wrap) + log.info(f'Initialized ConcatTokensFromFilesDataset.') def __iter__(self) -> Iterable[Dict[str, NDArray]]: - + log.info( + 'Starting iteration over files in ConcatTokensFromFilesDataset', + ) buffer = [] for file in self.files: + log.info(f'Processing file: {file}') with open(file, 'r') as f: buffer += self.bos_tokens first_chunk = True @@ -102,6 +106,10 @@ def __iter__(self) -> Iterable[Dict[str, NDArray]]: buffer = buffer[self.max_length:] if self.should_wrap else [] yield {'tokens': np.asarray(concat_sample, dtype=np.int32)} + log.info( + 'Finished iterating over files in ConcatTokensFromFilesDataset', + ) + def parse_args() -> Namespace: """Parse commandline arguments.""" @@ -238,6 +246,7 @@ def get_object_names(input_folder: str) -> List[str]: name for name in object_store.list_objects(folder_prefix) if name.endswith('.txt') ] + log.info(f'Found {len(names)} text files in remote storage') else: # input_folder is a local folder names = [ @@ -280,10 +289,16 @@ def get_task_args( compression (str): The compression algorithm to use for MDS writing trust_remote_code (bool): If true, allows custom code to be executed to load the tokenizer """ + log.info( + f'Preparing task arguments for {len(object_names)} objects across {n_groups} groups', + ) num_objects = len(object_names) objs_per_group = math.ceil(num_objects / n_groups) for group, i in enumerate(range(0, num_objects, objs_per_group)): output_subdir = os.path.join(output_root, str(group)) + log.info( + f'Created task for group {group} with {min(objs_per_group, num_objects - i)} objects', + ) yield ( object_names[i:min(i + objs_per_group, num_objects)], output_subdir, @@ -332,15 +347,19 @@ def download_and_convert( compression (str): The compression algorithm to use for MDS writing trust_remote_code (bool): If true, allows custom code to be executed to load the tokenizer """ + log.info(f'Starting download and conversion for {len(file_names)} files') + object_store = maybe_create_object_store_from_uri(input_folder) # Download file_names with tempfile.TemporaryDirectory() as tmp_dir: + log.info(f'Created temporary directory: {tmp_dir}') downloading_iter = DownloadingIterable( object_names=file_names, output_folder=tmp_dir, object_store=object_store, ) + log.info(f'Initializing tokenizer: {tokenizer_name}') tokenizer = AutoTokenizer.from_pretrained( tokenizer_name, trust_remote_code=trust_remote_code, @@ -369,6 +388,8 @@ def 
download_and_convert( for sample in tqdm(dataset): out.write(sample) + log.info(f'Completed download and conversion for {len(file_names)} files') + def is_remote_path(path: str) -> bool: """Checks whether a path is a remote path. @@ -394,6 +415,10 @@ def is_already_processed( args_str (str): String representation of the arguments object_names (List[str]): Names of objects to convert to MDS format """ + log.info( + f'Checking if {len(object_names)} objects have already been processed in {output_root}', + ) + # Retrieve the done file contents output_object_store = maybe_create_object_store_from_uri(output_root) if output_object_store is not None: @@ -412,27 +437,37 @@ def is_already_processed( ) with open(done_file) as df: done_file_contents = df.read().splitlines() + log.info(f'Retrieved done file contents from remote storage') except FileNotFoundError: + log.info('Done file not found in remote storage') return False else: # Read the local done file done_file = os.path.join(output_root, DONE_FILENAME) if not os.path.isfile(done_file): + log.info('Done file not found in local storage') return False with open(done_file) as df: done_file_contents = df.read().splitlines() + log.info(f'Retrieved done file contents from local storage') + # Compare the arguments prev_args_str = done_file_contents[0] if prev_args_str != args_str: + log.info('Arguments have changed, reprocessing required') return False # Compare file names prev_names = done_file_contents[1:] if len(prev_names) != len(object_names): + log.info('Number of files has changed, reprocessing required') return False for idx, prev_name in enumerate(prev_names): if object_names[idx] != prev_name: + log.info('File names have changed, reprocessing required') return False + + log.info('All files have already been processed') return True @@ -448,7 +483,9 @@ def write_done_file(folder: str, args_str: str, object_names: List[str]): object_names (List[str]): List of objects to convert to MDS format """ with open(os.path.join(folder, DONE_FILENAME), 'w') as done_file: + log.info(f'Writing done file.') done_file.write('\n'.join([args_str] + object_names) + '\n') + log.info(f'Done file written successfully') def convert_text_to_mds( @@ -482,9 +519,11 @@ def convert_text_to_mds( trust_remote_code (bool): If true, allows custom code to be executed to load the tokenizer """ is_remote_output = is_remote_path(output_folder) + log.info(f'Output is remote: {is_remote_output}') object_names = get_object_names(input_folder) if len(object_names) == 0: + log.error(f'No text files found in input folder: {input_folder}') raise InputFolderMissingDataError(input_folder) # Check if the text files in the bucket have already been processed. 
@@ -503,11 +542,14 @@ def convert_text_to_mds( # Use a temporary local directory if the output is remote and there are more than 1 processes local_output_folder = tempfile.TemporaryDirectory( ).name if is_remote_output else output_folder + log.info(f'Using local output folder: {local_output_folder}') if os.path.isdir(output_folder) and len(os.listdir(output_folder)) > 0: + log.error(f'Output folder is not empty: {output_folder}') raise OutputFolderNotEmptyError(output_folder) if processes > 1: + log.info(f'Using multiprocessing with {processes} processes') # Download and convert the text files in parallel args = get_task_args( object_names, @@ -525,9 +567,11 @@ def convert_text_to_mds( with ProcessPoolExecutor(max_workers=processes) as executor: list(executor.map(download_and_convert_starargs, args)) + log.info('Merging MDS shards from each process') # Merge the mds shards from each of the processes into a single folder merge_shard_groups(local_output_folder) else: + log.info('Using single process for download and conversion') download_and_convert( object_names, local_output_folder, From e7bf8db640676a45865ae1b00c1698b952018e88 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 16 Jul 2024 15:59:19 -0700 Subject: [PATCH 05/57] Add convert_dataset_hf to CLI (#1348) * convert_dataset_hf * precommit * precommit * arguemnt * optino * back to fully option * typer is a pain * comma sep * checks * test * typo * clean imports * commit comments 1 * commit comments 2 (precommit hell) * script args * typer defaults * precommit * bruh * precommit * yapf * cli * update annotation * update annotation * merge * merge + refactor * typo * typo * move app * typo --------- Co-authored-by: v-chen_data --- llmfoundry/cli/cli.py | 47 +- llmfoundry/cli/data_prep_cli.py | 61 +++ llmfoundry/cli/registry_cli.py | 4 +- llmfoundry/command_utils/__init__.py | 6 + .../data_prep/convert_dataset_hf.py | 489 ++++++++++++++++++ scripts/data_prep/convert_dataset_hf.py | 429 +-------------- .../data_prep/test_convert_dataset_hf.py | 31 +- tests/data/test_dataloader.py | 63 +-- tests/data_utils.py | 32 +- 9 files changed, 658 insertions(+), 504 deletions(-) create mode 100644 llmfoundry/cli/data_prep_cli.py create mode 100644 llmfoundry/command_utils/data_prep/convert_dataset_hf.py diff --git a/llmfoundry/cli/cli.py b/llmfoundry/cli/cli.py index 606f6855ff..6c4a2d12c4 100644 --- a/llmfoundry/cli/cli.py +++ b/llmfoundry/cli/cli.py @@ -1,25 +1,34 @@ # Copyright 2024 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 -from typing import Optional +from typing import Annotated, Optional -import typer +from typer import Argument, Typer -from llmfoundry.cli import registry_cli -from llmfoundry.command_utils import eval_from_yaml, train_from_yaml +from llmfoundry.cli import ( + data_prep_cli, + registry_cli, +) +from llmfoundry.command_utils import ( + eval_from_yaml, + train_from_yaml, +) -app = typer.Typer(pretty_exceptions_show_locals=False) +app = Typer(pretty_exceptions_show_locals=False) app.add_typer(registry_cli.app, name='registry') +app.add_typer(data_prep_cli.app, name='data_prep') @app.command(name='train') def train( - yaml_path: str = typer.Argument( - ..., - help='Path to the YAML configuration file', - ), # type: ignore - args_list: Optional[list[str]] = typer. 
- Argument(None, help='Additional command line arguments'), # type: ignore + yaml_path: Annotated[str, + Argument( + ..., + help='Path to the YAML configuration file', + )], + args_list: Annotated[ + Optional[list[str]], + Argument(help='Additional command line arguments')] = None, ): """Run the training with optional overrides from CLI.""" train_from_yaml(yaml_path, args_list) @@ -27,14 +36,16 @@ def train( @app.command(name='eval') def eval( - yaml_path: str = typer.Argument( - ..., - help='Path to the YAML configuration file', - ), # type: ignore - args_list: Optional[list[str]] = typer. - Argument(None, help='Additional command line arguments'), # type: ignore + yaml_path: Annotated[str, + Argument( + ..., + help='Path to the YAML configuration file', + )], + args_list: Annotated[ + Optional[list[str]], + Argument(help='Additional command line arguments')] = None, ): - """Run the training with optional overrides from CLI.""" + """Run the eval with optional overrides from CLI.""" eval_from_yaml(yaml_path, args_list) diff --git a/llmfoundry/cli/data_prep_cli.py b/llmfoundry/cli/data_prep_cli.py new file mode 100644 index 0000000000..731a9f06f0 --- /dev/null +++ b/llmfoundry/cli/data_prep_cli.py @@ -0,0 +1,61 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +from typing import Annotated, Optional + +from typer import Option, Typer + +from llmfoundry.command_utils import ( + convert_dataset_hf_from_args, +) + +app = Typer(pretty_exceptions_show_locals=False) + + +@app.command(name='convert_dataset_hf') +def convert_dataset_hf( + dataset: Annotated[str, Option(..., help='Name of the dataset')], + out_root: Annotated[str, Option(..., help='Output root directory')], + data_subset: Annotated[ + Optional[str], + Option(help='Subset of the dataset (e.g., "all" or "en")'), + ] = None, + splits: Annotated[str, + Option(help='Comma-separated list of dataset splits',), + ] = 'train, train_small, val, val_small, val_xsmall', + compression: Annotated[Optional[str], + Option(help='Compression type')] = None, + concat_tokens: Annotated[ + Optional[int], + Option(help='Concatenate tokens up to this many tokens')] = None, + tokenizer: Annotated[Optional[str], + Option(help='Tokenizer name')] = None, + tokenizer_kwargs: Annotated[ + Optional[str], + Option(help='Tokenizer keyword arguments in JSON format')] = None, + bos_text: Annotated[Optional[str], Option(help='BOS text')] = None, + eos_text: Annotated[Optional[str], Option(help='EOS text')] = None, + no_wrap: Annotated[ + bool, + Option(help='Do not wrap text across max_length boundaries'), + ] = False, + num_workers: Annotated[Optional[int], + Option(help='Number of workers')] = None, +): + """Converts dataset from HuggingFace into JSON files.""" + # Convert comma-separated splits into a list + splits_list = splits.split(',') if splits else [] + convert_dataset_hf_from_args( + dataset=dataset, + data_subset=data_subset, + splits=splits_list, + out_root=out_root, + compression=compression, + concat_tokens=concat_tokens, + tokenizer=tokenizer, + tokenizer_kwargs=tokenizer_kwargs, + bos_text=bos_text, + eos_text=eos_text, + no_wrap=no_wrap, + num_workers=num_workers, + ) diff --git a/llmfoundry/cli/registry_cli.py b/llmfoundry/cli/registry_cli.py index 38ada51fd9..db090cd3aa 100644 --- a/llmfoundry/cli/registry_cli.py +++ b/llmfoundry/cli/registry_cli.py @@ -3,15 +3,15 @@ from typing import Optional -import typer from rich.console import Console from rich.table import Table +from typer import Typer from 
llmfoundry import registry from llmfoundry.utils.registry_utils import TypedRegistry console = Console() -app = typer.Typer(pretty_exceptions_show_locals=False) +app = Typer(pretty_exceptions_show_locals=False) def _get_registries(group: Optional[str] = None) -> list[TypedRegistry]: diff --git a/llmfoundry/command_utils/__init__.py b/llmfoundry/command_utils/__init__.py index 7dd8a32c36..adaaf03b6e 100644 --- a/llmfoundry/command_utils/__init__.py +++ b/llmfoundry/command_utils/__init__.py @@ -1,5 +1,9 @@ # Copyright 2024 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 +from llmfoundry.command_utils.data_prep.convert_dataset_hf import ( + convert_dataset_hf, + convert_dataset_hf_from_args, +) from llmfoundry.command_utils.eval import ( eval_from_yaml, evaluate, @@ -20,4 +24,6 @@ 'validate_config', 'evaluate', 'eval_from_yaml', + 'convert_dataset_hf', + 'convert_dataset_hf_from_args', ] diff --git a/llmfoundry/command_utils/data_prep/convert_dataset_hf.py b/llmfoundry/command_utils/data_prep/convert_dataset_hf.py new file mode 100644 index 0000000000..f9bbe6b0cf --- /dev/null +++ b/llmfoundry/command_utils/data_prep/convert_dataset_hf.py @@ -0,0 +1,489 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +"""Streaming dataset conversion scripts for C4 and The Pile.""" +import json +import os +import platform +from dataclasses import dataclass, field +from enum import Enum +from typing import Any, Dict, Iterable, Optional, Union + +import datasets as hf_datasets +import psutil +import torch +from numpy.typing import NDArray +from streaming import MDSWriter +from torch.utils.data import DataLoader, Dataset, IterableDataset +from tqdm import tqdm +from transformers import PreTrainedTokenizerBase + +from llmfoundry.data import ConcatTokensDataset, NoConcatDataset +from llmfoundry.utils.builders import build_tokenizer + + +class ConcatMode(Enum): + NO_CONCAT = 'NO_CONCAT' + CONCAT_TOKENS = 'CONCAT_TOKENS' + + +@dataclass +class DataSplitConstants: + hf_split: str + folder_split: str + raw_samples: Optional[int] + truncated_samples: Union[int, None] + + +@dataclass +class DatasetConstants: + chars_per_sample: int + chars_per_token: int + splits: Dict[str, DataSplitConstants] = field(default_factory=dict) + + def __iter__(self): + for v in self.splits.values(): + yield v + + +class TrainSmallConstants(DataSplitConstants): + + def __init__( + self, + hf_split: str = 'train', + folder_split: str = 'train_small', + raw_samples: int = 100000, + truncated_samples: int = 100000, + ): + super().__init__(hf_split, folder_split, raw_samples, truncated_samples) + + +class ValSmallConstants(DataSplitConstants): + + def __init__( + self, + hf_split: str = 'validation', + folder_split: str = 'val_small', + raw_samples: int = 10000, + truncated_samples: int = 10000, + ): + super().__init__(hf_split, folder_split, raw_samples, truncated_samples) + + +class ValXSmallConstants(DataSplitConstants): + + def __init__( + self, + hf_split: str = 'validation', + folder_split: str = 'val_xsmall', + raw_samples: int = 3000, + truncated_samples: int = 3000, + ): + super().__init__(hf_split, folder_split, raw_samples, truncated_samples) + + +pileconstants = DatasetConstants( + chars_per_sample=6212, # Computed over validation set + chars_per_token=4, # OpenAI estimate +) +pileconstants.splits['train'] = DataSplitConstants( + hf_split='train', + folder_split='train', + raw_samples=210607728, + truncated_samples=None, +) +pileconstants.splits['train_small'] = 
DataSplitConstants( + hf_split='train', + folder_split='train_small', + raw_samples=100000, + truncated_samples=100000, +) +pileconstants.splits['val'] = DataSplitConstants( + hf_split='validation', + folder_split='val', + raw_samples=214670, + truncated_samples=None, +) +pileconstants.splits['val_small'] = DataSplitConstants( + hf_split='validation', + folder_split='val_small', + raw_samples=10000, + truncated_samples=10000, +) +pileconstants.splits['val_xsmall'] = DataSplitConstants( + hf_split='validation', + folder_split='val_xsmall', + raw_samples=3000, + truncated_samples=3000, +) + +c4constants = DatasetConstants( + chars_per_sample=2163, # Computed over validation set + chars_per_token=4, # OpenAI estimate +) +c4constants.splits['train'] = DataSplitConstants( + hf_split='train', + folder_split='train', + raw_samples=364868892, + truncated_samples=None, +) +c4constants.splits['train_small'] = DataSplitConstants( + hf_split='train', + folder_split='train_small', + raw_samples=100000, + truncated_samples=100000, +) +c4constants.splits['val'] = DataSplitConstants( + hf_split='validation', + folder_split='val', + raw_samples=364608, + truncated_samples=None, +) +c4constants.splits['val_small'] = DataSplitConstants( + hf_split='validation', + folder_split='val_small', + raw_samples=10000, + truncated_samples=10000, +) +c4constants.splits['val_xsmall'] = DataSplitConstants( + hf_split='validation', + folder_split='val_xsmall', + raw_samples=3000, + truncated_samples=3000, +) +c4constants.splits['val_xxsmall'] = DataSplitConstants( + hf_split='validation', + folder_split='val_xxsmall', + raw_samples=100, + truncated_samples=100, +) + +CONSTS = {'c4': c4constants, 'the_pile': pileconstants} + + +def build_hf_dataset( + dataset_name: str, + split: str, + mode: ConcatMode, + max_length: Optional[int] = None, + bos_text: str = '', + eos_text: str = '', + no_wrap: bool = False, + tokenizer: PreTrainedTokenizerBase = None, + data_subset: Union[str, None] = None, +) -> IterableDataset: + """Build an IterableDataset over the HF C4 or pile source data. + + Args: + dataset_name (str): Dataset name + split (str): Split name. + mode (ConcatMode): NO_CONCAT, or CONCAT_TOKENS + max_length (int): The length of concatenated tokens + bos_text (str): text to insert at the beginning of each sequence + eos_text (str): text to insert at the end of each sequence + no_wrap (bool): if concatenating, whether to wrap text across `max_length` boundaries + tokenizer (PreTrainedTokenizerBase): if mode is CONCAT_TOKENS, the tokenizer to use + data_subset (str): Referred to as "name" in HuggingFace datasets.load_dataset. + Typically "all" (The Pile) or "en" (c4). + + Returns: + An IterableDataset. + """ + hf_dataset = hf_datasets.load_dataset( + path=dataset_name, + name=data_subset, + split=split, + streaming=True, + ) + if mode == ConcatMode.NO_CONCAT: + dataset = NoConcatDataset(hf_dataset) + else: + if not isinstance(tokenizer, PreTrainedTokenizerBase): + raise ValueError( + f'{tokenizer=} must be of type PreTrainedTokenizerBase', + ) + if max_length is None: + raise ValueError(f'max_length must be set.') + if bos_text + eos_text == '': + test_tokens = tokenizer('test') + if test_tokens['input_ids'][ + 0] != tokenizer.bos_token_id and test_tokens['input_ids'][ + -1] != tokenizer.eos_token_id: + tok_error_msg = 'This tokenizer does not insert an EOS nor BOS token. ' + tok_error_msg += 'Concatenating with this tokenizer will result in sequences being ' + tok_error_msg += 'attached without a separating token. 
Please use another tokenizer, '
+                tok_error_msg += 'such as facebook/opt-125m, or specify EOS/BOS text with e.g. '
+                tok_error_msg += '--bos_text=<|endoftext|>.'
+                raise ValueError(tok_error_msg)
+        dataset = ConcatTokensDataset(
+            hf_dataset=hf_dataset,
+            tokenizer=tokenizer,
+            max_length=max_length,
+            bos_text=bos_text,
+            eos_text=eos_text,
+            no_wrap=no_wrap,
+        )
+    return dataset
+
+
+def _est_progress_denominator(
+    total_samples: int,
+    chars_per_sample: int,
+    chars_per_token: int,
+    mode: ConcatMode,
+    max_length: int,
+):
+    est_tokens_per_sample = chars_per_sample // chars_per_token
+    if mode == ConcatMode.NO_CONCAT:
+        return total_samples
+    elif mode == ConcatMode.CONCAT_TOKENS:
+        return total_samples * est_tokens_per_sample // max_length
+
+
+def build_dataloader(
+    dataset: Dataset,
+    batch_size: int,
+    num_workers: Optional[int],
+) -> DataLoader:
+    if num_workers is None:
+        # Multiple workers is only supported on linux machines
+        if 'linux' in platform.platform().lower() or 'macos' in platform.platform().lower():
+            num_workers = max(1, psutil.cpu_count())
+        else:
+            num_workers = 0
+
+    # If using multiple workers, configure each worker to prefetch as many samples as it can, up to
+    # the aggregate device batch size
+    # If not using workers, the torch DataLoader expects the default value for prefetch_factor,
+    # which non-intuitively must be 2.
+    prefetch_factor = max(
+        1,
+        2 * batch_size // num_workers,
+    ) if num_workers > 0 else 2
+
+    return DataLoader(
+        dataset=dataset,
+        sampler=None,
+        batch_size=batch_size,
+        num_workers=num_workers,
+        prefetch_factor=prefetch_factor,
+    )
+
+
+def generate_samples(
+    loader: DataLoader,
+    truncate_num_samples: Optional[int] = None,
+) -> Iterable[Union[Dict[str, bytes], Dict[str, NDArray]]]:
+    """Generator over samples of a dataloader.
+
+    Args:
+        loader (DataLoader): A dataloader emitting batches like {key: [sample0_bytes, sample1_bytes, sample2_bytes, ...]}
+        truncate_num_samples (Optional[int]): An optional # of samples to stop at.
+
+    Yields:
+        Sample dicts.
+    """
+    n_samples = 0
+    for batch in loader:
+        keys = list(batch.keys())
+        current_bs = len(batch[keys[0]])
+        for idx in range(current_bs):
+            if truncate_num_samples is not None and n_samples == truncate_num_samples:
+                return
+            n_samples += 1
+            yield {
+                k:
+                    v[idx].numpy() if isinstance(v[idx], torch.Tensor) else v[idx]
+                for k, v in batch.items()
+            }
+
+
+def convert_dataset_hf(
+    dataset: str,
+    data_subset: Optional[str],
+    splits: list[str],
+    out_root: str,
+    compression: Optional[str],
+    concat_tokens: Optional[int],
+    tokenizer: Optional[str],
+    tokenizer_kwargs: dict[str, Any],
+    bos_text: str,
+    eos_text: str,
+    no_wrap: bool,
+    num_workers: Optional[int],
+) -> None:
+    """Converts HuggingFace datasets to MDS format.
+ + Args: + dataset (str): Name of the dataset + data_subset (Optional[str]): Subset of the dataset (e.g., "all" or "en") + splits (list[str]): Comma-separated list of dataset splits + out_root (str): Output root directory + compression (Optional[str]): Compression type + concat_tokens (Optional[int]): Concatenate tokens up to this many tokens + tokenizer (Optional[str]): Tokenizer name + tokenizer_kwargs (dict[str, Any]): Tokenizer keyword arguments + bos_text (str): BOS text + eos_text (str): EOS text + no_wrap (bool): Do not wrap text across max_length boundaries + num_workers (Optional[int]): Number of workers + + Raises: + KeyError: If constants are not defined for the split + """ + try: + dataset_constants = CONSTS[dataset] + except KeyError: + raise ValueError( + f'Constants for dataset "{dataset}" not found. Currently only "the_pile" and "c4" are supported.', + ) + + if concat_tokens is not None and tokenizer is not None: + mode = ConcatMode.CONCAT_TOKENS + built_tokenizer = build_tokenizer(tokenizer, tokenizer_kwargs) + # we will enforce length, so suppress warnings about sequences too long for the model + built_tokenizer.model_max_length = int(1e30) + columns = {'tokens': 'ndarray:int32'} + else: + mode = ConcatMode.NO_CONCAT + built_tokenizer = None + columns = {'text': 'str'} + + for split_name in splits: + try: + split = dataset_constants.splits[split_name] + except KeyError: + raise KeyError(f'Constants not defined for split {split_name}.') + hf_split = split.hf_split + folder_split = split.folder_split + expected_num_samples = split.raw_samples + truncate_num_samples = split.truncated_samples + # Only generate the splits requested + if folder_split not in splits: + continue + + # Get samples + hf_dataset = build_hf_dataset( + dataset_name=dataset, + data_subset=data_subset, + split=hf_split, + mode=mode, + max_length=concat_tokens, + bos_text=bos_text, + eos_text=eos_text, + no_wrap=no_wrap, + tokenizer=built_tokenizer, + ) + loader = build_dataloader( + dataset=hf_dataset, + batch_size=512, + num_workers=num_workers, + ) + samples = generate_samples( + loader, + truncate_num_samples=truncate_num_samples, + ) + + if expected_num_samples is not None and concat_tokens is not None: + denominator = truncate_num_samples if truncate_num_samples is not None else _est_progress_denominator( + total_samples=expected_num_samples, + chars_per_sample=dataset_constants.chars_per_sample, + chars_per_token=dataset_constants.chars_per_token, + mode=mode, + max_length=concat_tokens, + ) + else: + denominator = None + + # Write samples + print(f'Converting {folder_split} to MDS format...') + print( + f'Note: the progress bar is based on the dataset length before tokenization, and may finish at a value before 100%.', + ) + with MDSWriter( + columns=columns, + out=os.path.join(out_root, folder_split), + compression=compression, + ) as out: + if denominator is not None: + for sample in tqdm( + samples, + desc=folder_split, + total=denominator, + ): + out.write(sample) + else: + for sample in tqdm(samples, desc=folder_split): + out.write(sample) + + +def convert_dataset_hf_from_args( + dataset: str, + data_subset: Optional[str], + splits: list[str], + out_root: str, + compression: Optional[str], + concat_tokens: Optional[int], + tokenizer: Optional[str], + tokenizer_kwargs: Optional[str], + bos_text: Optional[str], + eos_text: Optional[str], + no_wrap: bool, + num_workers: Optional[int], +) -> None: + """A wrapper for `convert_dataset_hf` that parses arguments. 
+ + Args: + dataset (str): Name of the dataset + data_subset (Optional[str]): Subset of the dataset (e.g., "all" or "en") + splits (list[str]): Comma-separated list of dataset splits + out_root (str): Output root directory + compression (Optional[str]): Compression type + concat_tokens (Optional[int]): Concatenate tokens up to this many tokens + tokenizer (Optional[str]): Tokenizer name + tokenizer_kwargs (Optional[str]): Tokenizer keyword arguments in JSON format + bos_text (Optional[str]): BOS text + eos_text (Optional[str]): EOS text + no_wrap (bool): Do not wrap text across max_length boundaries + num_workers (Optional[int]): Number of workers + + Raises: + ValueError: If the output directory already contains the requested splits + ValueError: If `concat_tokens` is set but `tokenizer` is not + """ + if tokenizer_kwargs: + parsed_tokenizer_kwargs = json.loads(tokenizer_kwargs) + else: + parsed_tokenizer_kwargs = {} + + if os.path.isdir(out_root) and len( + set(os.listdir(out_root)).intersection(set(splits)), + ) > 0: + raise ValueError( + f'--out_root={out_root} contains {os.listdir(out_root)} which cannot overlap with the requested splits {splits}.', + ) + + # Make sure we have needed concat options + if ( + concat_tokens is not None and isinstance(concat_tokens, int) and + tokenizer is None + ): + raise ValueError( + 'When setting --concat_tokens, you must specify a --tokenizer', + ) + + # now that we have validated them, change BOS/EOS to strings and convert + convert_dataset_hf( + dataset=dataset, + data_subset=data_subset, + splits=splits, + out_root=out_root, + compression=compression, + concat_tokens=concat_tokens, + tokenizer=tokenizer, + tokenizer_kwargs=parsed_tokenizer_kwargs, + bos_text=bos_text if bos_text else '', + eos_text=eos_text if eos_text else '', + no_wrap=no_wrap, + num_workers=num_workers, + ) diff --git a/scripts/data_prep/convert_dataset_hf.py b/scripts/data_prep/convert_dataset_hf.py index bf7f145610..3b893868b2 100644 --- a/scripts/data_prep/convert_dataset_hf.py +++ b/scripts/data_prep/convert_dataset_hf.py @@ -2,30 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 """Streaming dataset conversion scripts for C4 and The Pile.""" -import json -import os -import platform from argparse import ArgumentParser, Namespace -from dataclasses import dataclass, field -from enum import Enum -from typing import Dict, Iterable, Optional, Union -import datasets as hf_datasets -import psutil -import torch -from numpy.typing import NDArray -from streaming import MDSWriter -from torch.utils.data import DataLoader, Dataset, IterableDataset -from tqdm import tqdm -from transformers import PreTrainedTokenizerBase - -from llmfoundry.data import ConcatTokensDataset, NoConcatDataset -from llmfoundry.utils.builders import build_tokenizer - - -class ConcatMode(Enum): - NO_CONCAT = 'NO_CONCAT' - CONCAT_TOKENS = 'CONCAT_TOKENS' +from llmfoundry.command_utils import convert_dataset_hf_from_args def parse_args() -> Namespace: @@ -64,398 +43,22 @@ def parse_args() -> Namespace: parser.add_argument('--num_workers', type=int, required=False, default=None) parsed = parser.parse_args() - - if parsed.tokenizer_kwargs is not None: - parsed.tokenizer_kwargs = json.loads(parsed.tokenizer_kwargs) - else: - parsed.tokenizer_kwargs = {} - - if os.path.isdir(parsed.out_root) and len( - set(os.listdir(parsed.out_root)).intersection(set(parsed.splits)), - ) > 0: - raise ValueError( - f'--out_root={parsed.out_root} contains {os.listdir(parsed.out_root)} which cannot overlap with the requested splits 
{parsed.splits}.', - ) - - # Make sure we have needed concat options - if ( - parsed.concat_tokens is not None and - isinstance(parsed.concat_tokens, int) and parsed.tokenizer is None - ): - parser.error( - 'When setting --concat_tokens, you must specify a --tokenizer', - ) - - # now that we have validated them, change BOS/EOS to strings - if parsed.bos_text is None: - parsed.bos_text = '' - if parsed.eos_text is None: - parsed.eos_text = '' return parsed -@dataclass -class DataSplitConstants: - hf_split: str - folder_split: str - raw_samples: Optional[int] - truncated_samples: Union[int, None] - - -@dataclass -class DatasetConstants: - chars_per_sample: int - chars_per_token: int - splits: Dict[str, DataSplitConstants] = field(default_factory=dict) - - def __iter__(self): - for v in self.splits.values(): - yield v - - -class TrainSmallConstants(DataSplitConstants): - - def __init__( - self, - hf_split: str = 'train', - folder_split: str = 'train_small', - raw_samples: int = 100000, - truncated_samples: int = 100000, - ): - super().__init__(hf_split, folder_split, raw_samples, truncated_samples) - - -class ValSmallConstants(DataSplitConstants): - - def __init__( - self, - hf_split: str = 'validation', - folder_split: str = 'val_small', - raw_samples: int = 10000, - truncated_samples: int = 10000, - ): - super().__init__(hf_split, folder_split, raw_samples, truncated_samples) - - -class ValXSmallConstants(DataSplitConstants): - - def __init__( - self, - hf_split: str = 'validation', - folder_split: str = 'val_xsmall', - raw_samples: int = 3000, - truncated_samples: int = 3000, - ): - super().__init__(hf_split, folder_split, raw_samples, truncated_samples) - - -pileconstants = DatasetConstants( - chars_per_sample=6212, # Computed over validation set - chars_per_token=4, # OpenAI estimate -) -pileconstants.splits['train'] = DataSplitConstants( - hf_split='train', - folder_split='train', - raw_samples=210607728, - truncated_samples=None, -) -pileconstants.splits['train_small'] = DataSplitConstants( - hf_split='train', - folder_split='train_small', - raw_samples=100000, - truncated_samples=100000, -) -pileconstants.splits['val'] = DataSplitConstants( - hf_split='validation', - folder_split='val', - raw_samples=214670, - truncated_samples=None, -) -pileconstants.splits['val_small'] = DataSplitConstants( - hf_split='validation', - folder_split='val_small', - raw_samples=10000, - truncated_samples=10000, -) -pileconstants.splits['val_xsmall'] = DataSplitConstants( - hf_split='validation', - folder_split='val_xsmall', - raw_samples=3000, - truncated_samples=3000, -) - -c4constants = DatasetConstants( - chars_per_sample=2163, # Computed over validation set - chars_per_token=4, # OpenAI estimate -) -c4constants.splits['train'] = DataSplitConstants( - hf_split='train', - folder_split='train', - raw_samples=364868892, - truncated_samples=None, -) -c4constants.splits['train_small'] = DataSplitConstants( - hf_split='train', - folder_split='train_small', - raw_samples=100000, - truncated_samples=100000, -) -c4constants.splits['val'] = DataSplitConstants( - hf_split='validation', - folder_split='val', - raw_samples=364608, - truncated_samples=None, -) -c4constants.splits['val_small'] = DataSplitConstants( - hf_split='validation', - folder_split='val_small', - raw_samples=10000, - truncated_samples=10000, -) -c4constants.splits['val_xsmall'] = DataSplitConstants( - hf_split='validation', - folder_split='val_xsmall', - raw_samples=3000, - truncated_samples=3000, -) -c4constants.splits['val_xxsmall'] = 
DataSplitConstants( - hf_split='validation', - folder_split='val_xxsmall', - raw_samples=100, - truncated_samples=100, -) - -CONSTS = {'c4': c4constants, 'the_pile': pileconstants} - - -def build_hf_dataset( - dataset_name: str, - split: str, - mode: ConcatMode, - max_length: Optional[int] = None, - bos_text: str = '', - eos_text: str = '', - no_wrap: bool = False, - tokenizer: PreTrainedTokenizerBase = None, - data_subset: Union[str, None] = None, -) -> IterableDataset: - """Build an IterableDataset over the HF C4 or pile source data. - - Args: - dataset_name (str): Dataset name - split (str): Split name. - mode (ConcatMode): NO_CONCAT, or CONCAT_TOKENS - max_length (int): The length of concatenated tokens - bos_text (str): text to insert at the beginning of each sequence - eos_text (str): text to insert at the end of each sequence - no_wrap (bool): if concatenating, whether to wrap text across `max_length` boundaries - tokenizer (PreTrainedTokenizerBase): if mode is CONCAT_TOKENS, the tokenizer to use - data_subset (str): Referred to as "name" in HuggingFace datasets.load_dataset. - Typically "all" (The Pile) or "en" (c4). - - Returns: - An IterableDataset. - """ - hf_dataset = hf_datasets.load_dataset( - path=dataset_name, - name=data_subset, - split=split, - streaming=True, - ) - if mode == ConcatMode.NO_CONCAT: - dataset = NoConcatDataset(hf_dataset) - else: - if not isinstance(tokenizer, PreTrainedTokenizerBase): - raise ValueError( - f'{tokenizer=} must be of type PreTrainedTokenizerBase', - ) - if max_length is None: - raise ValueError(f'max_length must be set.') - if bos_text + eos_text == '': - test_tokens = tokenizer('test') - if test_tokens['input_ids'][ - 0] != tokenizer.bos_token_id and test_tokens['input_ids'][ - -1] != tokenizer.eos_token_id: - tok_error_msg = 'This tokenizer does not insert an EOS nor BOS token. ' - tok_error_msg += 'Concatenating with this tokenizer will result in sequences being ' - tok_error_msg += 'attached without a separating token. Please use another tokenizer, ' - tok_error_msg += 'such as facebook/opt-125m, or specify EOS/BOS text with e.g. ' - tok_error_msg += '--bos_text=<|endoftext|>.' - raise ValueError(tok_error_msg) - dataset = ConcatTokensDataset( - hf_dataset=hf_dataset, - tokenizer=tokenizer, - max_length=max_length, - bos_text=bos_text, - eos_text=eos_text, - no_wrap=no_wrap, - ) - return dataset - - -def _est_progress_denominator( - total_samples: int, - chars_per_sample: int, - chars_per_token: int, - mode: ConcatMode, - max_length: int, -): - est_tokens_per_sample = chars_per_sample // chars_per_token - if mode == ConcatMode.NO_CONCAT: - return total_samples - elif mode == ConcatMode.CONCAT_TOKENS: - return total_samples * est_tokens_per_sample // max_length - - -def build_dataloader( - dataset: Dataset, - batch_size: int, - num_workers: Optional[int], -) -> DataLoader: - if num_workers is None: - # Multiple workers is only supported on linux machines - if 'linux' or 'macos' in platform.platform().lower(): - num_workers = max(1, psutil.cpu_count()) - else: - num_workers = 0 - - # If using multiple workers, configure each worker to prefetch as many samples as it can, up to - # the aggregate device batch size - # If not using workers, the torch DataLoader expects the default value for prefetch_factor, - # which non-intuitively must be 2. 
- prefetch_factor = max( - 1, - 2 * batch_size // num_workers, - ) if num_workers > 0 else 2 - - return DataLoader( - dataset=dataset, - sampler=None, - batch_size=batch_size, - num_workers=num_workers, - prefetch_factor=prefetch_factor, - ) - - -def generate_samples( - loader: DataLoader, - truncate_num_samples: Optional[int] = None, -) -> Iterable[Union[Dict[str, bytes], Dict[str, NDArray]]]: - """Generator over samples of a dataloader. - - Args: - loader (DataLoader): A dataloader emitting batches like {key: [sample0_bytes, sample1_bytes, sample2_bytes, ...]} - truncate_num_samples (Optional[int]): An optional # of samples to stop at. - - Yields: - Sample dicts. - """ - n_samples = 0 - for batch in loader: - keys = list(batch.keys()) - current_bs = len(batch[keys[0]]) - for idx in range(current_bs): - if truncate_num_samples is not None and n_samples == truncate_num_samples: - return - n_samples += 1 - yield { - k: - v[idx].numpy() if isinstance(v[idx], torch.Tensor) else v[idx] - for k, v in batch.items() - } - - -def main(args: Namespace) -> None: - """Main: create C4/pile streaming dataset. - - Args: - args (Namespace): Commandline arguments. - """ - try: - dataset_constants = CONSTS[args.dataset] - except KeyError: - raise ValueError( - f'Constants for dataset "{args.dataset}" not found. Currently only "the_pile" and "c4" are supported.', - ) - - if args.concat_tokens is not None: - mode = ConcatMode.CONCAT_TOKENS - tokenizer = build_tokenizer(args.tokenizer, args.tokenizer_kwargs) - # we will enforce length, so suppress warnings about sequences too long for the model - tokenizer.model_max_length = int(1e30) - columns = {'tokens': 'ndarray:int32'} - else: - mode = ConcatMode.NO_CONCAT - tokenizer = None - columns = {'text': 'str'} - - for split_name in args.splits: - try: - split = dataset_constants.splits[split_name] - except KeyError: - raise KeyError(f'Constants not defined for split {split_name}.') - hf_split = split.hf_split - folder_split = split.folder_split - expected_num_samples = split.raw_samples - truncate_num_samples = split.truncated_samples - # Only generate the splits requested - if folder_split not in args.splits: - continue - - # Get samples - dataset = build_hf_dataset( - dataset_name=args.dataset, - data_subset=args.data_subset, - split=hf_split, - mode=mode, - max_length=args.concat_tokens, - bos_text=args.bos_text, - eos_text=args.eos_text, - no_wrap=args.no_wrap, - tokenizer=tokenizer, - ) - loader = build_dataloader( - dataset=dataset, - batch_size=512, - num_workers=args.num_workers, - ) - samples = generate_samples( - loader, - truncate_num_samples=truncate_num_samples, - ) - - if expected_num_samples is not None: - denominator = truncate_num_samples if truncate_num_samples is not None else _est_progress_denominator( - total_samples=expected_num_samples, - chars_per_sample=dataset_constants.chars_per_sample, - chars_per_token=dataset_constants.chars_per_token, - mode=mode, - max_length=args.concat_tokens, - ) - else: - denominator = None - - # Write samples - print(f'Converting {folder_split} to MDS format...') - print( - f'Note: the progress bar is based on the dataset length before tokenization, and may finish at a value before 100%.', - ) - with MDSWriter( - columns=columns, - out=os.path.join(args.out_root, folder_split), - compression=args.compression, - ) as out: - if denominator is not None: - for sample in tqdm( - samples, - desc=folder_split, - total=denominator, - ): - out.write(sample) - else: - for sample in tqdm(samples, desc=folder_split): - 
out.write(sample) - - if __name__ == '__main__': - main(parse_args()) + args = parse_args() + convert_dataset_hf_from_args( + dataset=args.dataset, + data_subset=args.data_subset, + splits=args.splits, + out_root=args.out_root, + compression=args.compression, + concat_tokens=args.concat_tokens, + tokenizer=args.tokenizer, + tokenizer_kwargs=args.tokenizer_kwargs, + bos_text=args.bos_text, + eos_text=args.eos_text, + no_wrap=args.no_wrap, + num_workers=args.num_workers, + ) diff --git a/tests/a_scripts/data_prep/test_convert_dataset_hf.py b/tests/a_scripts/data_prep/test_convert_dataset_hf.py index 4c5d1a6bba..e09c54ca70 100644 --- a/tests/a_scripts/data_prep/test_convert_dataset_hf.py +++ b/tests/a_scripts/data_prep/test_convert_dataset_hf.py @@ -2,29 +2,26 @@ # SPDX-License-Identifier: Apache-2.0 import os -from argparse import Namespace from pathlib import Path -from scripts.data_prep.convert_dataset_hf import main as main_hf +from llmfoundry.command_utils import convert_dataset_hf def test_download_script_from_api(tmp_path: Path): # test calling it directly path = os.path.join(tmp_path, 'my-copy-c4-1') - main_hf( - Namespace( - **{ - 'dataset': 'c4', - 'data_subset': 'en', - 'splits': ['val_xsmall'], - 'out_root': path, - 'compression': None, - 'concat_tokens': None, - 'bos_text': None, - 'eos_text': None, - 'no_wrap': False, - 'num_workers': None, - }, - ), + convert_dataset_hf( + dataset='c4', + data_subset='en', + splits=['val_xsmall'], + out_root=path, + compression=None, + concat_tokens=None, + bos_text='', + eos_text='', + no_wrap=False, + num_workers=None, + tokenizer=None, + tokenizer_kwargs={}, ) assert os.path.exists(path) diff --git a/tests/data/test_dataloader.py b/tests/data/test_dataloader.py index a489002399..21d73c0d34 100644 --- a/tests/data/test_dataloader.py +++ b/tests/data/test_dataloader.py @@ -5,7 +5,6 @@ import pathlib import random import shutil -from argparse import Namespace from contextlib import nullcontext as does_not_raise from pathlib import Path from typing import ContextManager, Literal, Optional, Union @@ -22,6 +21,7 @@ from streaming import MDSWriter from streaming.base.util import clean_stale_shared_memory +from llmfoundry.command_utils import convert_dataset_hf from llmfoundry.data import build_dataloader, build_finetuning_dataloader from llmfoundry.data.finetuning.collator import ( _HF_IGNORE_INDEX, @@ -56,7 +56,6 @@ UnknownExampleTypeError, ) # yapf: enable -from scripts.data_prep.convert_dataset_hf import main as main_hf from scripts.data_prep.convert_finetuning_dataset import get_columns_and_format from tests.data_utils import ( make_tiny_conversation_ft_dataset, @@ -204,42 +203,34 @@ def test_correct_padding( path = get_abs_data_path(data_local) shutil.rmtree(path, ignore_errors=True) if pretokenize: - main_hf( - Namespace( - **{ - 'dataset': 'c4', - 'data_subset': 'en', - 'splits': [split], - 'out_root': path, - 'compression': None, - 'concat_tokens': 2048, - 'tokenizer': tokenizer_name, - 'tokenizer_kwargs': {}, - 'bos_text': bos_text, - 'eos_text': eos_text, - 'no_wrap': False, - 'num_workers': None, - }, - ), + convert_dataset_hf( + dataset='c4', + data_subset='en', + splits=[split], + out_root=path, + compression=None, + concat_tokens=2048, + tokenizer=tokenizer_name, + tokenizer_kwargs={}, + bos_text=bos_text, + eos_text=eos_text, + no_wrap=False, + num_workers=None, ) else: - main_hf( - Namespace( - **{ - 'dataset': 'c4', - 'data_subset': 'en', - 'splits': [split], - 'out_root': path, - 'compression': None, - 'concat_tokens': None, - 
'tokenizer': tokenizer_name, - 'tokenizer_kwargs': {}, - 'bos_text': bos_text, - 'eos_text': eos_text, - 'no_wrap': False, - 'num_workers': None, - }, - ), + convert_dataset_hf( + dataset='c4', + data_subset='en', + splits=[split], + out_root=path, + compression=None, + concat_tokens=None, + tokenizer=tokenizer_name, + tokenizer_kwargs={}, + bos_text=bos_text, + eos_text=eos_text, + no_wrap=False, + num_workers=None, ) if not os.path.isdir(path): raise RuntimeError(f'c4 dataset at {path} not set up as expected') diff --git a/tests/data_utils.py b/tests/data_utils.py index 9653d8579a..35e11db531 100644 --- a/tests/data_utils.py +++ b/tests/data_utils.py @@ -11,7 +11,7 @@ from omegaconf import DictConfig from omegaconf import OmegaConf as om -from scripts.data_prep.convert_dataset_hf import main as main_hf # noqa: E402 +from llmfoundry.command_utils import convert_dataset_hf from scripts.data_prep.convert_dataset_json import \ main as main_json # noqa: E402 @@ -230,23 +230,19 @@ def create_c4_dataset_xxsmall(path: Path) -> str: downloaded_split = 'val_xxsmall' # very fast to convert # Hyperparameters from https://github.com/mosaicml/llm-foundry/blob/340a56658560ebceb2a3aa69d6e37813e415acd0/README.md#L188 - main_hf( - Namespace( - **{ - 'dataset': 'c4', - 'data_subset': 'en', - 'splits': [downloaded_split], - 'out_root': c4_dir, - 'compression': None, - 'concat_tokens': 2048, - 'tokenizer': 'EleutherAI/gpt-neox-20b', - 'tokenizer_kwargs': {}, - 'bos_text': '', - 'eos_text': '<|endoftext|>', - 'no_wrap': False, - 'num_workers': 8, - }, - ), + convert_dataset_hf( + dataset='c4', + data_subset='en', + splits=[downloaded_split], + out_root=c4_dir, + compression=None, + concat_tokens=2048, + tokenizer='EleutherAI/gpt-neox-20b', + tokenizer_kwargs={}, + bos_text='', + eos_text='<|endoftext|>', + no_wrap=False, + num_workers=8, ) # copy the small downloaded_split to other c4 splits for mocking purposes From 54aeef17f14f7061d1502a2d4d280cf2e5b0a069 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 16 Jul 2024 17:27:17 -0700 Subject: [PATCH 06/57] Add missing init (#1368) * missing init * license * precommit --------- Co-authored-by: v-chen_data --- llmfoundry/command_utils/data_prep/__init__.py | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 llmfoundry/command_utils/data_prep/__init__.py diff --git a/llmfoundry/command_utils/data_prep/__init__.py b/llmfoundry/command_utils/data_prep/__init__.py new file mode 100644 index 0000000000..80950cb7b4 --- /dev/null +++ b/llmfoundry/command_utils/data_prep/__init__.py @@ -0,0 +1,2 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 From 93dd4f3e8e078a385c0bd776dd7dd77ee20526b7 Mon Sep 17 00:00:00 2001 From: Jose Javier <26491792+josejg@users.noreply.github.com> Date: Wed, 17 Jul 2024 10:07:35 -0700 Subject: [PATCH 07/57] Make ICL dataloaders lazy (#1359) Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- .../datasets/in_context_learning_evaluation.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/llmfoundry/eval/datasets/in_context_learning_evaluation.py b/llmfoundry/eval/datasets/in_context_learning_evaluation.py index c87b38b09a..8a8b9de551 100644 --- a/llmfoundry/eval/datasets/in_context_learning_evaluation.py +++ b/llmfoundry/eval/datasets/in_context_learning_evaluation.py @@ -172,17 +172,26 @@ def __init__( self.dataset = self.dataset.map(strip_data) fewshot_rng = random.Random(fewshot_random_seed) + self._prepared = False + self.num_fewshot 
= num_fewshot + self.prompt_string = prompt_string + self.fewshot_rng = fewshot_rng + + def _prepare_dataset(self): self.dataset: HFDataset = self.dataset.map( self._prep_example, with_indices=True, fn_kwargs={ - 'num_fewshot': num_fewshot, - 'prompt_string': prompt_string, - 'fewshot_rng': fewshot_rng, + 'num_fewshot': self.num_fewshot, + 'prompt_string': self.prompt_string, + 'fewshot_rng': self.fewshot_rng, }, ) + self._prepared = True def __getitem__(self, index: int) -> Dict: + if not self._prepared: + self._prepare_dataset() return self.dataset[index] def __len__(self) -> int: From 221d252e37a2c8284af74b06230b474ebd1ea5b4 Mon Sep 17 00:00:00 2001 From: Saaketh Narayan Date: Wed, 17 Jul 2024 13:33:03 -0700 Subject: [PATCH 08/57] Add option to unfuse Wqkv (#1367) * yo * logging * bro * yo * yo * yo * lint * if * datest * webacc * liny --- llmfoundry/models/layers/attention.py | 71 +++++++-- llmfoundry/models/mpt/configuration_mpt.py | 2 + llmfoundry/models/utils/config_defaults.py | 1 + tests/models/layers/test_attention.py | 160 +++++++++++++++++++++ 4 files changed, 220 insertions(+), 14 deletions(-) create mode 100644 tests/models/layers/test_attention.py diff --git a/llmfoundry/models/layers/attention.py b/llmfoundry/models/layers/attention.py index dde7d64cd7..8e740be2b3 100644 --- a/llmfoundry/models/layers/attention.py +++ b/llmfoundry/models/layers/attention.py @@ -411,6 +411,7 @@ def __init__( clip_qkv: Optional[float] = None, qk_ln: bool = False, qk_gn: bool = False, + fused_qkv: bool = True, softmax_scale: Optional[float] = None, attn_pdrop: float = 0.0, norm_type: str = 'low_precision_layernorm', @@ -426,6 +427,7 @@ def __init__( self.clip_qkv = clip_qkv self.qk_ln = qk_ln self.qk_gn = qk_gn + self.fused_qkv = fused_qkv self.d_model = d_model self.n_heads = n_heads @@ -462,7 +464,17 @@ def __init__( self.softmax_scale = 1 / math.sqrt(self.d_model / self.n_heads) self.attn_dropout_p = attn_pdrop - if self.reuse_kv_layer_idx is None: + if self.reuse_kv_layer_idx is not None: + self.Wq = build_fc( + name=fc_type_name, + in_features=self.d_model, + out_features=self.d_model, + fc_kwargs=fc_type, + ) + # for param init fn; enables shape based init of fused layers + fuse_splits = [i * self.head_dim for i in range(1, self.n_heads)] + self.Wq._fused = (0, fuse_splits) + elif self.fused_qkv: self.Wqkv = build_fc( name=fc_type_name, in_features=self.d_model, @@ -482,9 +494,26 @@ def __init__( out_features=self.d_model, fc_kwargs=fc_type, ) + self.Wk = build_fc( + name=fc_type_name, + in_features=self.d_model, + out_features=self.kv_n_heads * self.head_dim, + fc_kwargs=fc_type, + ) + self.Wv = build_fc( + name=fc_type_name, + in_features=self.d_model, + out_features=self.kv_n_heads * self.head_dim, + fc_kwargs=fc_type, + ) # for param init fn; enables shape based init of fused layers - fuse_splits = [i * self.head_dim for i in range(1, self.n_heads)] - self.Wq._fused = (0, fuse_splits) + q_fuse_splits = [i * self.head_dim for i in range(1, self.n_heads)] + kv_fuse_splits = [ + i * self.head_dim for i in range(1, self.kv_n_heads) + ] + self.Wq._fused = (0, q_fuse_splits) + self.Wk._fused = (0, kv_fuse_splits) + self.Wv._fused = (0, kv_fuse_splits) if self.qk_ln or self.qk_gn: norm_size = self.head_dim if qk_gn else d_model @@ -601,19 +630,29 @@ def get_qkv( query = self.q_ln(query).to(dtype).view(q_shape) return query, key, value - qkv = self.Wqkv(x) + if self.fused_qkv: + qkv = self.Wqkv(x) - if self.clip_qkv: - qkv = qkv.clamp(min=-self.clip_qkv, max=self.clip_qkv) + if 
self.clip_qkv: + qkv = qkv.clamp(min=-self.clip_qkv, max=self.clip_qkv) + + query, key, value = qkv.split( + [ + self.d_model, + self.kv_n_heads * self.head_dim, + self.kv_n_heads * self.head_dim, + ], + dim=2, + ) + else: + query = self.Wq(x) + key = self.Wk(x) + value = self.Wv(x) - query, key, value = qkv.split( - [ - self.d_model, - self.kv_n_heads * self.head_dim, - self.kv_n_heads * self.head_dim, - ], - dim=2, - ) + if self.clip_qkv: + query = query.clamp(min=-self.clip_qkv, max=self.clip_qkv) + key = key.clamp(min=-self.clip_qkv, max=self.clip_qkv) + value = value.clamp(min=-self.clip_qkv, max=self.clip_qkv) if self.qk_ln or self.qk_gn: # Applying layernorm to qk @@ -753,6 +792,7 @@ def __init__( clip_qkv: Optional[float] = None, qk_ln: bool = False, qk_gn: bool = False, + fused_qkv: bool = True, softmax_scale: Optional[float] = None, attn_pdrop: float = 0.0, norm_type: str = 'low_precision_layernorm', @@ -770,6 +810,7 @@ def __init__( clip_qkv=clip_qkv, qk_ln=qk_ln, qk_gn=qk_gn, + fused_qkv=fused_qkv, softmax_scale=softmax_scale, attn_pdrop=attn_pdrop, norm_type=norm_type, @@ -796,6 +837,7 @@ def __init__( clip_qkv: Optional[float] = None, qk_ln: bool = False, qk_gn: bool = False, + fused_qkv: bool = True, softmax_scale: Optional[float] = None, attn_pdrop: float = 0.0, norm_type: str = 'low_precision_layernorm', @@ -813,6 +855,7 @@ def __init__( clip_qkv=clip_qkv, qk_ln=qk_ln, qk_gn=qk_gn, + fused_qkv=fused_qkv, softmax_scale=softmax_scale, attn_pdrop=attn_pdrop, norm_type=norm_type, diff --git a/llmfoundry/models/mpt/configuration_mpt.py b/llmfoundry/models/mpt/configuration_mpt.py index a1fdc25f50..3de3744745 100644 --- a/llmfoundry/models/mpt/configuration_mpt.py +++ b/llmfoundry/models/mpt/configuration_mpt.py @@ -70,6 +70,8 @@ def __init__( attn_impl (str): The attention implementation to use. One of 'torch' or 'flash'. qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer. qk_gn (bool): Whether to apply group normalization to the queries and keys in the attention layer. + fused_qkv (bool): Whether to fuse the Wq, Wk, and Wv weight matrices in the attention layer. If True, the weights are fused into a single + Wqkv matrix, which can be faster for matmuls. If False, the weights are kept separate. Defaults to True. clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to this value. softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. 
If None, diff --git a/llmfoundry/models/utils/config_defaults.py b/llmfoundry/models/utils/config_defaults.py index 2b6fc2f7c7..c272a52dd4 100644 --- a/llmfoundry/models/utils/config_defaults.py +++ b/llmfoundry/models/utils/config_defaults.py @@ -15,6 +15,7 @@ 'attn_impl': 'flash', 'qk_ln': False, 'qk_gn': False, + 'fused_qkv': True, 'clip_qkv': None, 'softmax_scale': None, 'attn_uses_sequence_id': False, diff --git a/tests/models/layers/test_attention.py b/tests/models/layers/test_attention.py new file mode 100644 index 0000000000..bdffe2b49f --- /dev/null +++ b/tests/models/layers/test_attention.py @@ -0,0 +1,160 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +import pytest +import torch + +from llmfoundry.models.layers.layer_builders import build_attention_layer + + +@pytest.mark.parametrize( + 'attn_name', + ['multihead_attention', 'grouped_query_attention', 'multiquery_attention'], +) +@pytest.mark.parametrize('dim', [1024]) +def test_unfused_wqkv(attn_name: str, dim: int): + d_head = 128 + n_heads = dim // d_head + + generic_attn_kwargs = { + 'd_model': dim, + 'n_heads': n_heads, + 'fc_type': { + 'name': 'torch', + }, + 'device': 'cpu', + 'attn_pdrop': 0.0, + 'attn_impl': 'torch', + 'qk_ln': False, + 'qk_gn': False, + 'clip_qkv': None, + 'softmax_scale': None, + 'sliding_window_size': -1, + } + + if attn_name == 'grouped_query_attention': + kv_n_heads = 2 + generic_attn_kwargs['kv_n_heads'] = kv_n_heads + elif attn_name == 'multiquery_attention': + kv_n_heads = 1 + elif attn_name == 'multihead_attention': + kv_n_heads = n_heads + else: + raise ValueError(f'Unknown attention name: {attn_name}') + + attn_config_fused = generic_attn_kwargs.copy() + attn_config_fused['fused_qkv'] = True + + attn_config_unfused = generic_attn_kwargs.copy() + attn_config_unfused['fused_qkv'] = False + + attn_fused = build_attention_layer( + name=attn_name, + attn_kwargs=attn_config_fused, + ) + attn_unfused = build_attention_layer( + name=attn_name, + attn_kwargs=attn_config_unfused, + ) + + # Make sure unfused attention has the same params as the fused one. + fused_wqkv = attn_fused.Wqkv.weight.detach().clone() + kv_heads_len = (fused_wqkv.shape[0] - dim) // 2 + Wq_shape_before = (attn_unfused.Wq.weight.shape, attn_unfused.Wq.bias.shape) + Wk_shape_before = (attn_unfused.Wk.weight.shape, attn_unfused.Wk.bias.shape) + Wv_shape_before = (attn_unfused.Wv.weight.shape, attn_unfused.Wv.bias.shape) + + attn_unfused.Wq.weight.data = fused_wqkv[:dim, :] + attn_unfused.Wk.weight.data = fused_wqkv[dim:dim + kv_heads_len, :] + attn_unfused.Wv.weight.data = fused_wqkv[dim + kv_heads_len:, :] + attn_unfused.out_proj.weight.data = attn_fused.out_proj.weight + attn_unfused.Wq.bias.data = attn_fused.Wqkv.bias[:dim] + attn_unfused.Wk.bias.data = attn_fused.Wqkv.bias[dim:dim + kv_heads_len] + attn_unfused.Wv.bias.data = attn_fused.Wqkv.bias[dim + kv_heads_len:] + attn_unfused.out_proj.bias.data = attn_fused.out_proj.bias + + # Make sure initialization fuse splits are as expected. 
+ all_fuse_splits = ( + 0, + [i * d_head for i in range(1, n_heads + 2 * kv_n_heads)], + ) + q_fuse_splits = (0, [i * d_head for i in range(1, n_heads)]) + kv_fuse_splits = (0, [i * d_head for i in range(1, kv_n_heads)]) + + assert attn_fused.Wqkv._fused == all_fuse_splits + assert attn_unfused.Wq._fused == q_fuse_splits + assert attn_unfused.Wk._fused == kv_fuse_splits + assert attn_unfused.Wv._fused == kv_fuse_splits + + assert torch.allclose( + attn_fused.Wqkv.weight, + torch.cat( + [ + attn_unfused.Wq.weight, + attn_unfused.Wk.weight, + attn_unfused.Wv.weight, + ], + dim=0, + ), + ) + assert torch.allclose( + attn_fused.Wqkv.bias, + torch.cat( + [ + attn_unfused.Wq.bias, + attn_unfused.Wk.bias, + attn_unfused.Wv.bias, + ], + dim=0, + ), + ) + assert torch.allclose( + attn_fused.out_proj.weight, + attn_unfused.out_proj.weight, + ) + assert torch.allclose(attn_fused.out_proj.bias, attn_unfused.out_proj.bias) + + assert Wq_shape_before == ( + attn_unfused.Wq.weight.shape, + attn_unfused.Wq.bias.shape, + ) + assert Wk_shape_before == ( + attn_unfused.Wk.weight.shape, + attn_unfused.Wk.bias.shape, + ) + assert Wv_shape_before == ( + attn_unfused.Wv.weight.shape, + attn_unfused.Wv.bias.shape, + ) + + x1 = torch.randn(1, 1, dim) + x2 = x1.detach().clone() + x1.requires_grad = True + x2.requires_grad = True + + out_fused, _, _ = attn_fused(x1) + out_unfused, _, _ = attn_unfused(x2) + + assert torch.allclose(out_fused, out_unfused) + + # Dummy loss function is simply the sum. + loss_fused = out_fused.sum() + loss_fused.backward() + + loss_unfused = out_unfused.sum() + loss_unfused.backward() + + assert isinstance(x1.grad, torch.Tensor) + assert isinstance(x2.grad, torch.Tensor) + assert torch.allclose(x1.grad, x2.grad) + combined_grad = torch.concat( + [ + attn_unfused.Wq.weight.grad, + attn_unfused.Wk.weight.grad, + attn_unfused.Wv.weight.grad, + ], + dim=0, + ) + assert isinstance(attn_fused.Wqkv.weight.grad, torch.Tensor) + assert isinstance(combined_grad, torch.Tensor) + assert torch.allclose(attn_fused.Wqkv.weight.grad, combined_grad) From 6f87962296a04ee9cedd45cf5595cef3bbfca64f Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Wed, 17 Jul 2024 17:34:10 -0700 Subject: [PATCH 09/57] Add convert_dataset_json to CLI (#1349) * convert_dataset_json * cli * rename * precommit * rename * commit comments 1 * precommit * annotation * help * update annotation * fix imports * missing init * precommit * precommit --------- Co-authored-by: v-chen_data --- llmfoundry/cli/data_prep_cli.py | 43 ++++ llmfoundry/command_utils/__init__.py | 6 + .../data_prep/convert_dataset_json.py | 222 ++++++++++++++++++ scripts/data_prep/convert_dataset_json.py | 157 +------------ .../data_prep/test_convert_dataset_json.py | 27 +-- tests/data_utils.py | 32 ++- 6 files changed, 304 insertions(+), 183 deletions(-) create mode 100644 llmfoundry/command_utils/data_prep/convert_dataset_json.py diff --git a/llmfoundry/cli/data_prep_cli.py b/llmfoundry/cli/data_prep_cli.py index 731a9f06f0..5c5c8df388 100644 --- a/llmfoundry/cli/data_prep_cli.py +++ b/llmfoundry/cli/data_prep_cli.py @@ -7,6 +7,7 @@ from llmfoundry.command_utils import ( convert_dataset_hf_from_args, + convert_dataset_json_from_args, ) app = Typer(pretty_exceptions_show_locals=False) @@ -59,3 +60,45 @@ def convert_dataset_hf( no_wrap=no_wrap, num_workers=num_workers, ) + + +@app.command(name='convert_dataset_json') +def convert_dataset_json( + path: Annotated[str, Option(..., help='Path to the input data file')], + out_root: Annotated[str, Option(..., 
help='Output root directory')], + concat_tokens: Annotated[ + int, + Option( + ..., + help='Convert text to tokens and concatenate up to this many tokens', + )], + tokenizer: Annotated[str, Option(..., help='Tokenizer name')], + compression: Annotated[Optional[str], + Option(help='Compression type, if any')] = 'zstd', + split: Annotated[str, Option(help='Dataset split to process')] = 'train', + bos_text: Annotated[ + Optional[str], + Option(help='Text to insert at the beginning of each sequence')] = None, + eos_text: Annotated[ + Optional[str], + Option(help='Text to insert at the end of each sequence')] = None, + no_wrap: Annotated[ + bool, + Option(help='Do not wrap text across max_length boundaries')] = False, + num_workers: Annotated[ + Optional[int], + Option(help='Number of workers for data loading')] = None, +): + """Convert a dataset from JSON to MDS streaming format.""" + convert_dataset_json_from_args( + path=path, + split=split, + out_root=out_root, + compression=compression, + concat_tokens=concat_tokens, + tokenizer=tokenizer, + bos_text=bos_text, + eos_text=eos_text, + no_wrap=no_wrap, + num_workers=num_workers, + ) diff --git a/llmfoundry/command_utils/__init__.py b/llmfoundry/command_utils/__init__.py index adaaf03b6e..5f147cb42a 100644 --- a/llmfoundry/command_utils/__init__.py +++ b/llmfoundry/command_utils/__init__.py @@ -4,6 +4,10 @@ convert_dataset_hf, convert_dataset_hf_from_args, ) +from llmfoundry.command_utils.data_prep.convert_dataset_json import ( + convert_dataset_json, + convert_dataset_json_from_args, +) from llmfoundry.command_utils.eval import ( eval_from_yaml, evaluate, @@ -26,4 +30,6 @@ 'eval_from_yaml', 'convert_dataset_hf', 'convert_dataset_hf_from_args', + 'convert_dataset_json', + 'convert_dataset_json_from_args', ] diff --git a/llmfoundry/command_utils/data_prep/convert_dataset_json.py b/llmfoundry/command_utils/data_prep/convert_dataset_json.py new file mode 100644 index 0000000000..9f174d1aaf --- /dev/null +++ b/llmfoundry/command_utils/data_prep/convert_dataset_json.py @@ -0,0 +1,222 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +"""Streaming dataset conversion scripts for json files.""" +import os +from enum import Enum +from glob import glob +from typing import Optional + +import datasets as hf_datasets +from streaming import MDSWriter +from torch.utils.data import IterableDataset +from tqdm import tqdm +from transformers import AutoTokenizer, PreTrainedTokenizerBase + +from llmfoundry.data import ConcatTokensDataset, NoConcatDataset + + +class ConcatMode(Enum): + NO_CONCAT = 'NO_CONCAT' + CONCAT_TOKENS = 'CONCAT_TOKENS' + + +def build_hf_dataset( + path: str, + split: str, + mode: ConcatMode, + max_length: Optional[int] = None, + bos_text: str = '', + eos_text: str = '', + no_wrap: bool = False, + tokenizer: PreTrainedTokenizerBase = None, +) -> IterableDataset: + """Build an IterableDataset over the HF C4 or pile source data. + + Args: + dataset_name (str): Dataset name + split (str): Split name. + mode (ConcatMode): NO_CONCAT, or CONCAT_TOKENS + max_length (int): The length of concatenated tokens + bos_text (str): text to insert at the beginning of each sequence + eos_text (str): text to insert at the end of each sequence + no_wrap (bool): if concatenating, whether to wrap text across `max_length` boundaries + tokenizer (PreTrainedTokenizerBase): if mode is CONCAT_TOKENS, the tokenizer to use + data_subset (str): Referred to as "name" in HuggingFace datasets.load_dataset. 
+ Typically "all" (The Pile) or "en" (c4). + + Returns: + An IterableDataset. + """ + if os.path.isdir(path): + data_files = glob(f'{path}/*') + else: + data_files = path + + hf_dataset = hf_datasets.load_dataset( + 'json', + data_files=data_files, + split=split, + ) + + if mode == ConcatMode.NO_CONCAT: + dataset = NoConcatDataset(hf_dataset) + else: + if not isinstance(tokenizer, PreTrainedTokenizerBase): + raise ValueError( + f'{tokenizer=} must be of type PreTrainedTokenizerBase', + ) + if max_length is None: + raise ValueError(f'max_length must be set.') + if bos_text + eos_text == '': + test_tokens = tokenizer('test') + if test_tokens['input_ids'][ + 0] != tokenizer.bos_token_id and test_tokens['input_ids'][ + -1] != tokenizer.eos_token_id: + tok_error_msg = 'This tokenizer does not insert an EOS nor BOS token. ' + tok_error_msg += 'Concatenating with this tokenizer will result in sequences being ' + tok_error_msg += 'attached without a separating token. Please use another tokenizer, ' + tok_error_msg += 'such as facebook/opt-125m, or specify EOS/BOS text with e.g. ' + tok_error_msg += '--bos_text=<|endoftext|>.' + raise ValueError(tok_error_msg) + dataset = ConcatTokensDataset( + hf_dataset=hf_dataset, + tokenizer=tokenizer, + max_length=max_length, + bos_text=bos_text, + eos_text=eos_text, + no_wrap=no_wrap, + ) + return dataset + + +def convert_dataset_json( + path: str, + out_root: str, + compression: Optional[str], + concat_tokens: Optional[int], + split: str, + tokenizer: Optional[str] = None, + bos_text: str = '', + eos_text: str = '', + no_wrap: bool = False, + num_workers: Optional[int] = None, +) -> None: + """Create C4/pile streaming dataset. + + Args: + path (str): Path to the input data file + out_root (str): Output root directory + compression (Optional[str]): Compression type, if any + concat_tokens (Optional[int]): Convert text to tokens and concatenate up to this many tokens + split (str): Dataset split to process + tokenizer (Optional[str]): Tokenizer name + bos_text (str): Text to insert at the beginning of each sequence + eos_text (str): Text to insert at the end of each sequence + no_wrap (bool): Do not wrap text across max_length boundaries + num_workers (Optional[int]): Number of workers for data loading + """ + if concat_tokens is not None: + mode = ConcatMode.CONCAT_TOKENS + built_tokenizer = AutoTokenizer.from_pretrained(tokenizer) + # we will enforce length, so suppress warnings about sequences too long for the model + built_tokenizer.model_max_length = int(1e30) + columns = {'tokens': 'ndarray:int32'} + else: + mode = ConcatMode.NO_CONCAT + built_tokenizer = None + columns = {'text': 'str'} + + # Get samples + dataset = build_hf_dataset( + path=path, + split=split, + mode=mode, + max_length=concat_tokens, + bos_text=bos_text, + eos_text=eos_text, + no_wrap=no_wrap, + tokenizer=built_tokenizer, + ) + + print('here') + + # Write samples + print(f'Converting to MDS format...') + print( + f'Note that the progress bar is based on the dataset length before tokenization.', + ) + print(f'It will finish at a value below 100% if tokenizing') + with MDSWriter( + columns=columns, + out=os.path.join(out_root), + compression=compression, + ) as out: + for sample in tqdm(dataset): + out.write(sample) + + +def convert_dataset_json_from_args( + path: str, + out_root: str, + compression: Optional[str], + concat_tokens: Optional[int], + split: str, + tokenizer: Optional[str] = None, + bos_text: Optional[str] = None, + eos_text: Optional[str] = None, + no_wrap: bool = False, + 
num_workers: Optional[int] = None,
+) -> None:
+    """A wrapper for `convert_dataset_json` that parses arguments.
+
+    Args:
+        path (str): Path to the input data file
+        out_root (str): Output root directory
+        compression (Optional[str]): Compression type, if any
+        concat_tokens (Optional[int]): Convert text to tokens and concatenate up to this many tokens
+        split (str): Dataset split to process
+        tokenizer (Optional[str]): Tokenizer name
+        bos_text (Optional[str]): Text to insert at the beginning of each sequence
+        eos_text (Optional[str]): Text to insert at the end of each sequence
+        no_wrap (bool): Do not wrap text across max_length boundaries
+        num_workers (Optional[int]): Number of workers for data loading
+
+    Raises:
+        ValueError: If the out_root directory exists and contains files that overlap with the requested splits
+        ValueError: If concat_tokens is set and a tokenizer is not provided
+    """
+    if os.path.isdir(out_root) and len(
+        set(os.listdir(out_root)).intersection(set(split)),
+    ) > 0:
+        raise ValueError(
+            f'--out_root={out_root} contains {os.listdir(out_root)} which cannot overlap with the requested splits {split}.',
+        )
+
+    # Make sure we have needed concat options
+    if (
+        concat_tokens is not None and isinstance(concat_tokens, int) and
+        tokenizer is None
+    ):
+        raise ValueError(
+            'When setting --concat_tokens, you must specify a --tokenizer',
+        )
+
+    # now that we have validated them, change BOS/EOS to strings
+    if bos_text is None:
+        bos_text = ''
+    if eos_text is None:
+        eos_text = ''
+
+    convert_dataset_json(
+        path=path,
+        out_root=out_root,
+        compression=compression,
+        concat_tokens=concat_tokens,
+        split=split,
+        tokenizer=tokenizer,
+        bos_text=bos_text,
+        eos_text=eos_text,
+        no_wrap=no_wrap,
+        num_workers=num_workers,
+    )
diff --git a/scripts/data_prep/convert_dataset_json.py b/scripts/data_prep/convert_dataset_json.py
index 37b0465692..5a6927ac75 100644
--- a/scripts/data_prep/convert_dataset_json.py
+++ b/scripts/data_prep/convert_dataset_json.py
@@ -2,24 +2,9 @@
 # SPDX-License-Identifier: Apache-2.0
 
 """Streaming dataset conversion scripts for json files."""
-import os
 from argparse import ArgumentParser, Namespace
-from enum import Enum
-from glob import glob
-from typing import Optional
 
-import datasets as hf_datasets
-from streaming import MDSWriter
-from torch.utils.data import IterableDataset
-from tqdm import tqdm
-from transformers import AutoTokenizer, PreTrainedTokenizerBase
-
-from llmfoundry.data import ConcatTokensDataset, NoConcatDataset
-
-
-class ConcatMode(Enum):
-    NO_CONCAT = 'NO_CONCAT'
-    CONCAT_TOKENS = 'CONCAT_TOKENS'
+from llmfoundry.command_utils import convert_dataset_json_from_args
 
 
 def parse_args() -> Namespace:
@@ -46,145 +31,19 @@ def parse_args() -> Namespace:
     parser.add_argument('--no_wrap', default=False, action='store_true')
 
     parsed = parser.parse_args()
-
-    if os.path.isdir(parsed.out_root) and len(
-        set(os.listdir(parsed.out_root)).intersection(set(parsed.split)),
-    ) > 0:
-        raise ValueError(
-            f'--out_root={parsed.out_root} contains {os.listdir(parsed.out_root)} which cannot overlap with the requested splits {parsed.splits}.',
-        )
-
-    # Make sure we have needed concat options
-    if (
-        parsed.concat_tokens is not None and
-        isinstance(parsed.concat_tokens, int) and parsed.tokenizer is None
-    ):
-        parser.error(
-            'When setting --concat_tokens, you must specify a --tokenizer',
-        )
-
-    # now that we have validated them, change BOS/EOS to strings
-    if parsed.bos_text is None:
-        parsed.bos_text = ''
-    if parsed.eos_text is None:
-
parsed.eos_text = '' return parsed -def build_hf_dataset( - path: str, - split: str, - mode: ConcatMode, - max_length: Optional[int] = None, - bos_text: str = '', - eos_text: str = '', - no_wrap: bool = False, - tokenizer: PreTrainedTokenizerBase = None, -) -> IterableDataset: - """Build an IterableDataset over the HF C4 or pile source data. - - Args: - dataset_name (str): Dataset name - split (str): Split name. - mode (ConcatMode): NO_CONCAT, or CONCAT_TOKENS - max_length (int): The length of concatenated tokens - bos_text (str): text to insert at the beginning of each sequence - eos_text (str): text to insert at the end of each sequence - no_wrap (bool): if concatenating, whether to wrap text across `max_length` boundaries - tokenizer (PreTrainedTokenizerBase): if mode is CONCAT_TOKENS, the tokenizer to use - data_subset (str): Referred to as "name" in HuggingFace datasets.load_dataset. - Typically "all" (The Pile) or "en" (c4). - - Returns: - An IterableDataset. - """ - if os.path.isdir(path): - data_files = glob(f'{path}/*') - else: - data_files = path - - hf_dataset = hf_datasets.load_dataset( - 'json', - data_files=data_files, - split=split, - ) - - if mode == ConcatMode.NO_CONCAT: - dataset = NoConcatDataset(hf_dataset) - else: - if not isinstance(tokenizer, PreTrainedTokenizerBase): - raise ValueError( - f'{tokenizer=} must be of type PreTrainedTokenizerBase', - ) - if max_length is None: - raise ValueError(f'max_length must be set.') - if bos_text + eos_text == '': - test_tokens = tokenizer('test') - if test_tokens['input_ids'][ - 0] != tokenizer.bos_token_id and test_tokens['input_ids'][ - -1] != tokenizer.eos_token_id: - tok_error_msg = 'This tokenizer does not insert an EOS nor BOS token. ' - tok_error_msg += 'Concatenating with this tokenizer will result in sequences being ' - tok_error_msg += 'attached without a separating token. Please use another tokenizer, ' - tok_error_msg += 'such as facebook/opt-125m, or specify EOS/BOS text with e.g. ' - tok_error_msg += '--bos_text=<|endoftext|>.' - raise ValueError(tok_error_msg) - dataset = ConcatTokensDataset( - hf_dataset=hf_dataset, - tokenizer=tokenizer, - max_length=max_length, - bos_text=bos_text, - eos_text=eos_text, - no_wrap=no_wrap, - ) - return dataset - - -def main(args: Namespace) -> None: - """Main: create C4/pile streaming dataset. - - Args: - args (Namespace): Commandline arguments. 
- """ - if args.concat_tokens is not None: - mode = ConcatMode.CONCAT_TOKENS - tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) - # we will enforce length, so suppress warnings about sequences too long for the model - tokenizer.model_max_length = int(1e30) - columns = {'tokens': 'ndarray:int32'} - else: - mode = ConcatMode.NO_CONCAT - tokenizer = None - columns = {'text': 'str'} - - # Get samples - dataset = build_hf_dataset( +if __name__ == '__main__': + args = parse_args() + convert_dataset_json_from_args( path=args.path, + out_root=args.out_root, + compression=args.compression, + concat_tokens=args.concat_tokens, split=args.split, - mode=mode, - max_length=args.concat_tokens, + tokenizer=args.tokenizer, bos_text=args.bos_text, eos_text=args.eos_text, no_wrap=args.no_wrap, - tokenizer=tokenizer, - ) - - print('here') - - # Write samples - print(f'Converting to MDS format...') - print( - f'Note that the progress bar is based on the dataset length before tokenization.', ) - print(f'It will finish at a value below 100% if tokenizing') - with MDSWriter( - columns=columns, - out=os.path.join(args.out_root), - compression=args.compression, - ) as out: - for sample in tqdm(dataset): - out.write(sample) - - -if __name__ == '__main__': - main(parse_args()) diff --git a/tests/a_scripts/data_prep/test_convert_dataset_json.py b/tests/a_scripts/data_prep/test_convert_dataset_json.py index 912e44cd0c..4f70a35637 100644 --- a/tests/a_scripts/data_prep/test_convert_dataset_json.py +++ b/tests/a_scripts/data_prep/test_convert_dataset_json.py @@ -2,28 +2,23 @@ # SPDX-License-Identifier: Apache-2.0 import os -from argparse import Namespace from pathlib import Path -from scripts.data_prep.convert_dataset_json import main as main_json +from llmfoundry.command_utils import convert_dataset_json def test_json_script_from_api(tmp_path: Path): # test calling it directly path = os.path.join(tmp_path, 'my-copy-arxiv-1') - main_json( - Namespace( - **{ - 'path': 'scripts/data_prep/example_data/arxiv.jsonl', - 'out_root': path, - 'compression': None, - 'split': 'train', - 'concat_tokens': None, - 'bos_text': None, - 'eos_text': None, - 'no_wrap': False, - 'num_workers': None, - }, - ), + convert_dataset_json( + path='scripts/data_prep/example_data/arxiv.jsonl', + out_root=path, + compression=None, + split='train', + concat_tokens=None, + bos_text='', + eos_text='', + no_wrap=False, + num_workers=None, ) assert os.path.exists(path) diff --git a/tests/data_utils.py b/tests/data_utils.py index 35e11db531..ea64943735 100644 --- a/tests/data_utils.py +++ b/tests/data_utils.py @@ -4,16 +4,16 @@ import json import os import shutil -from argparse import Namespace from pathlib import Path from typing import Dict, List, Optional from omegaconf import DictConfig from omegaconf import OmegaConf as om -from llmfoundry.command_utils import convert_dataset_hf -from scripts.data_prep.convert_dataset_json import \ - main as main_json # noqa: E402 +from llmfoundry.command_utils import ( + convert_dataset_hf, + convert_dataset_json, +) def make_tiny_ft_dataset( @@ -265,20 +265,16 @@ def create_arxiv_dataset(path: Path) -> str: if not os.getcwd().endswith('scripts'): arxiv_path = os.path.join('scripts', arxiv_path) - main_json( - Namespace( - **{ - 'path': arxiv_path, - 'out_root': arxiv_dir, - 'compression': None, - 'split': downloaded_split, - 'concat_tokens': None, - 'bos_text': None, - 'eos_text': None, - 'no_wrap': False, - 'num_workers': None, - }, - ), + convert_dataset_json( + path=arxiv_path, + out_root=arxiv_dir, + 
compression=None, + split=downloaded_split, + concat_tokens=None, + bos_text='', + eos_text='', + no_wrap=False, + num_workers=None, ) return arxiv_dir From 59b9c2ab7ae9951ef0c02597847c2447b93bba00 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Wed, 17 Jul 2024 17:56:13 -0700 Subject: [PATCH 10/57] Add convert_text_to_mds to CLI (#1352) * cli * cli * ignore * import * naming * typo * test * commit comments 1 * precommit * typo * typo * arg_str * annotation + help * update annotation * typo * precommit * precommit * pr comments --------- Co-authored-by: v-chen_data --- llmfoundry/cli/data_prep_cli.py | 46 ++ llmfoundry/command_utils/__init__.py | 6 + .../data_prep/convert_text_to_mds.py | 582 ++++++++++++++++++ scripts/data_prep/convert_text_to_mds.py | 543 +--------------- .../data_prep/test_convert_text_to_mds.py | 18 +- 5 files changed, 650 insertions(+), 545 deletions(-) create mode 100644 llmfoundry/command_utils/data_prep/convert_text_to_mds.py diff --git a/llmfoundry/cli/data_prep_cli.py b/llmfoundry/cli/data_prep_cli.py index 5c5c8df388..3ca53f4104 100644 --- a/llmfoundry/cli/data_prep_cli.py +++ b/llmfoundry/cli/data_prep_cli.py @@ -3,11 +3,13 @@ from typing import Annotated, Optional +import psutil from typer import Option, Typer from llmfoundry.command_utils import ( convert_dataset_hf_from_args, convert_dataset_json_from_args, + convert_text_to_mds_from_args, ) app = Typer(pretty_exceptions_show_locals=False) @@ -102,3 +104,47 @@ def convert_dataset_json( no_wrap=no_wrap, num_workers=num_workers, ) + + +@app.command(name='convert_text_to_mds') +def convert_text_to_mds( + output_folder: Annotated[str, Option(..., help='The folder to write output to')], + input_folder: Annotated[str, Option(..., help='The folder with text files to convert to MDS')], + concat_tokens: Annotated[int, Option(..., help='Convert text to tokens and concatenate up to this many tokens')], + tokenizer: Annotated[str, Option(..., help='The name of the tokenizer to use')], + bos_text: Annotated[Optional[str], Option(help='The text to prepend to each example to separate concatenated examples')] = None, + eos_text: Annotated[Optional[str], Option(help='The text to append to each example to separate concatenated examples')] = None, + compression: Annotated[str, Option(help='The compression algorithm to use for MDS writing')] = 'zstd', + use_tokenizer_eos: Annotated[bool, Option(help='Use the EOS text from the tokenizer')] = False, + no_wrap: Annotated[bool, Option(help='Whether to let text examples wrap across multiple training examples')] = False, + processes: Annotated[int, Option( + help='The number of processes to use to download and convert the dataset', + )] = min(max(psutil.cpu_count() - 2, 1), 32), # type: ignore + reprocess: Annotated[bool, Option( + help= + 'If true, reprocess the input_folder to MDS format. Otherwise, only reprocess upon changes to the input folder or dataset creation parameters.', + )] = False, + trust_remote_code: Annotated[bool, Option( + help='If true, allows custom code to be executed to load the tokenizer', + )] = False, + logging_level: Annotated[str, Option( + help='Logging level for the script. 
Default is INFO.', + )] = 'INFO', + +): + """Convert text files to MDS streaming format.""" + convert_text_to_mds_from_args( + output_folder=output_folder, + input_folder=input_folder, + compression=compression, + concat_tokens=concat_tokens, + tokenizer_name=tokenizer, + bos_text=bos_text, + eos_text=eos_text, + use_tokenizer_eos=use_tokenizer_eos, + no_wrap=no_wrap, + processes=processes, + reprocess=reprocess, + trust_remote_code=trust_remote_code, + logging_level=logging_level, + ) diff --git a/llmfoundry/command_utils/__init__.py b/llmfoundry/command_utils/__init__.py index 5f147cb42a..995c5345e7 100644 --- a/llmfoundry/command_utils/__init__.py +++ b/llmfoundry/command_utils/__init__.py @@ -8,6 +8,10 @@ convert_dataset_json, convert_dataset_json_from_args, ) +from llmfoundry.command_utils.data_prep.convert_text_to_mds import ( + convert_text_to_mds, + convert_text_to_mds_from_args, +) from llmfoundry.command_utils.eval import ( eval_from_yaml, evaluate, @@ -32,4 +36,6 @@ 'convert_dataset_hf_from_args', 'convert_dataset_json', 'convert_dataset_json_from_args', + 'convert_text_to_mds', + 'convert_text_to_mds_from_args', ] diff --git a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py new file mode 100644 index 0000000000..14afe279fd --- /dev/null +++ b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py @@ -0,0 +1,582 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +import logging +import math +import os +import tempfile +from concurrent.futures import ProcessPoolExecutor +from functools import partial +from glob import glob +from typing import Dict, Iterable, List, Optional, Tuple, cast + +import numpy as np +from composer.utils import ( + ObjectStore, + maybe_create_object_store_from_uri, + parse_uri, +) +from numpy.typing import NDArray +from streaming import MDSWriter +from tqdm import tqdm +from transformers import AutoTokenizer, PreTrainedTokenizerBase + +from llmfoundry.data.data import AbstractConcatTokensDataset +from llmfoundry.utils.data_prep_utils import ( + DownloadingIterable, + download_file, + merge_shard_groups, +) +from llmfoundry.utils.exceptions import ( + InputFolderMissingDataError, + OutputFolderNotEmptyError, +) + +log = logging.getLogger(__name__) + +DONE_FILENAME = '.text_to_mds_conversion_done' + + +class ConcatTokensFromFilesDataset(AbstractConcatTokensDataset): + """An IterableDataset that returns token samples for MDSWriter from files. + + Returns dicts of {'tokens': ndarray:int32} + + Each file is considered a sequence. 
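+
+    Text is read from each file in 1MB chunks and tokenized incrementally; the
+    configured BOS and EOS tokens are inserted at file boundaries so the token
+    stream records where one file ends and the next begins.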
+ """ + + def __init__( + self, + files: Iterable[str], + tokenizer: PreTrainedTokenizerBase, + max_length: int, + bos_text: str, + eos_text: str, + no_wrap: bool, + ): + self.files = files + super().__init__(tokenizer, max_length, bos_text, eos_text, no_wrap) + log.info(f'Initialized ConcatTokensFromFilesDataset.') + + def __iter__(self) -> Iterable[Dict[str, NDArray]]: + log.info( + 'Starting iteration over files in ConcatTokensFromFilesDataset', + ) + buffer = [] + for file in self.files: + log.info(f'Processing file: {file}') + with open(file, 'r') as f: + buffer += self.bos_tokens + first_chunk = True + # Read the file in 1MB chunks to avoid memory issues + for chunk in iter(partial(f.read, 1000000), ''): + # Tokenize the chunk + encoded = self.tokenizer( + chunk, + truncation=False, + padding=False, + ) + iids = encoded['input_ids'] + + # If this is not the first chunk, remove the BOS token + if not first_chunk: + if iids[0] == self.tokenizer.bos_token_id: + iids = iids[1:] + + # Add the tokens to the buffer + buffer += iids + while len(buffer) >= self.max_length: + concat_sample = buffer[:self.max_length] + buffer = buffer[self. + max_length:] if self.should_wrap else [] + yield { + 'tokens': np.asarray(concat_sample, dtype=np.int32), + } + + first_chunk = False + + # Add the EOS token to the buffer to separate files. + buffer += self.eos_tokens + + # Yield any remaining samples of size max_length. + while len(buffer) >= self.max_length: + concat_sample = buffer[:self.max_length] + buffer = buffer[self.max_length:] if self.should_wrap else [] + yield {'tokens': np.asarray(concat_sample, dtype=np.int32)} + + log.info( + 'Finished iterating over files in ConcatTokensFromFilesDataset', + ) + + +def get_object_names(input_folder: str) -> List[str]: + """Get object names from a local or remote folder. + + Args: + input_folder (str): local or remote folder path. + """ + object_store = maybe_create_object_store_from_uri(input_folder) + if object_store is not None: + _, _, folder_prefix = parse_uri(input_folder) + names = [ + name for name in object_store.list_objects(folder_prefix) + if name.endswith('.txt') + ] + log.info(f'Found {len(names)} text files in remote storage') + else: + # input_folder is a local folder + names = [ + text_file for dirpath, _, _ in os.walk(input_folder) + for text_file in glob(os.path.join(dirpath, '*.txt')) + ] + # return names, sizes + log.info(f'Found {len(names)} text files at {input_folder}') + + return names + + +def get_task_args( + object_names: List[str], + output_root: str, + input_folder: str, + n_groups: int, + tokenizer_name: str, + concat_tokens: int, + eos_text: str, + bos_text: str, + no_wrap: bool, + compression: str, + trust_remote_code: bool, +) -> Iterable: + """Get download_and_convert arguments split across n_groups. + + Each group handles a portion of object_names. 
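+    The final group may receive fewer objects when the total does not divide evenly.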
+ + Args: + object_names (List[str]): Names of objects to process + output_root (str): Folder to write MDS shards to + input_folder (str): Folder of text files to process + n_groups (int): Number of groups to split the object names into + tokenizer_name (str): Name of tokenizer to use + concat_tokens (int): Concatenate up to this many tokens + eos_text (str): Text to append to each example to separate concatenated samples + bos_text (str): Text to prepend to each example to separate concatenated samples + no_wrap: (bool): Whether to let text examples wrap across multiple training examples + compression (str): The compression algorithm to use for MDS writing + trust_remote_code (bool): If true, allows custom code to be executed to load the tokenizer + """ + log.info( + f'Preparing task arguments for {len(object_names)} objects across {n_groups} groups', + ) + num_objects = len(object_names) + objs_per_group = math.ceil(num_objects / n_groups) + for group, i in enumerate(range(0, num_objects, objs_per_group)): + output_subdir = os.path.join(output_root, str(group)) + log.info( + f'Created task for group {group} with {min(objs_per_group, num_objects - i)} objects', + ) + yield ( + object_names[i:min(i + objs_per_group, num_objects)], + output_subdir, + input_folder, + tokenizer_name, + concat_tokens, + eos_text, + bos_text, + no_wrap, + compression, + trust_remote_code, + ) + + +def download_and_convert_starargs(args: Tuple): + """Helper function to call download_and_convert with star args. + + This helps us use download_and_convert with multiprocessing. + """ + return download_and_convert(*args) + + +def download_and_convert( + file_names: List[str], + output_folder: str, + input_folder: str, + tokenizer_name: str, + concat_tokens: int, + eos_text: str, + bos_text: str, + no_wrap: bool, + compression: str, + trust_remote_code: bool, +): + """Downloads and converts text files to MDS format. 
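+
+    Files are streamed into a temporary directory, tokenized, concatenated into
+    fixed-length token samples, and written out as MDS shards.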
+
+    Args:
+        file_names (List[str]): Files to process
+        output_folder (str): Folder to write MDS shards to
+        input_folder (str): Folder of text files to process
+        tokenizer_name (str): Name of tokenizer to use
+        concat_tokens (int): Concatenate up to this many tokens
+        eos_text (str): Text to append to each example to separate concatenated samples
+        bos_text (str): Text to prepend to each example to separate concatenated samples
+        no_wrap (bool): Whether to let text examples wrap across multiple training examples
+        compression (str): The compression algorithm to use for MDS writing
+        trust_remote_code (bool): If true, allows custom code to be executed to load the tokenizer
+    """
+    log.info(f'Starting download and conversion for {len(file_names)} files')
+
+    object_store = maybe_create_object_store_from_uri(input_folder)
+
+    # Download file_names
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        log.info(f'Created temporary directory: {tmp_dir}')
+        downloading_iter = DownloadingIterable(
+            object_names=file_names,
+            output_folder=tmp_dir,
+            object_store=object_store,
+        )
+        log.info(f'Initializing tokenizer: {tokenizer_name}')
+        tokenizer = AutoTokenizer.from_pretrained(
+            tokenizer_name,
+            trust_remote_code=trust_remote_code,
+        )
+        tokenizer.model_max_length = 5000000000  # Hack to prevent warnings from HuggingFace
+
+        # Use the ConcatTokensDataset from LLM-foundry to concatenate sequences of tokens up
+        # to the maximum sequence length
+        dataset = ConcatTokensFromFilesDataset(
+            files=downloading_iter,
+            max_length=concat_tokens,
+            tokenizer=tokenizer,
+            eos_text=eos_text,
+            bos_text=bos_text,
+            no_wrap=no_wrap,
+        )
+
+        columns = {'tokens': 'ndarray:int32'}
+
+        log.info('Converting to MDS format...')
+        with MDSWriter(
+            out=output_folder,
+            columns=columns,
+            compression=compression,
+        ) as out:
+            for sample in tqdm(dataset):
+                out.write(sample)
+
+    log.info(f'Completed download and conversion for {len(file_names)} files')
+
+
+def is_remote_path(path: str) -> bool:
+    """Checks whether a path is a remote path.
+
+    Args:
+        path (str): path to check
+    """
+    backend, _, _ = parse_uri(path)
+    return backend != ''
+
+
+def is_already_processed(
+    output_root: str,
+    args_str: str,
+    object_names: List[str],
+) -> bool:
+    """Determines whether a group of text files has already been processed.
+
+    Checks the done file at output root to determine this.
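+    The done file records the argument string and the list of object names from
+    the previous run; both must match the current run for processing to be skipped.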
+ + Args: + output_root (str): Output folder where a done file may exist + args_str (str): String representation of the arguments + object_names (List[str]): Names of objects to convert to MDS format + """ + log.info( + f'Checking if {len(object_names)} objects have already been processed in {output_root}', + ) + + # Retrieve the done file contents + output_object_store = maybe_create_object_store_from_uri(output_root) + if output_object_store is not None: + # Download and read the done file from the remote object store + _, _, output_folder_prefix = parse_uri(output_root) + try: + with tempfile.TemporaryDirectory() as tmp_dir: + done_file = os.path.join(tmp_dir, DONE_FILENAME) + download_file( + object_store=output_object_store, + object_name=os.path.join( + output_folder_prefix, + DONE_FILENAME, + ), + output_filename=done_file, + ) + with open(done_file) as df: + done_file_contents = df.read().splitlines() + log.info(f'Retrieved done file contents from remote storage') + except FileNotFoundError: + log.info('Done file not found in remote storage') + return False + else: + # Read the local done file + done_file = os.path.join(output_root, DONE_FILENAME) + if not os.path.isfile(done_file): + log.info('Done file not found in local storage') + return False + with open(done_file) as df: + done_file_contents = df.read().splitlines() + log.info(f'Retrieved done file contents from local storage') + + # Compare the arguments + prev_args_str = done_file_contents[0] + if prev_args_str != args_str: + log.info('Arguments have changed, reprocessing required') + return False + + # Compare file names + prev_names = done_file_contents[1:] + if len(prev_names) != len(object_names): + log.info('Number of files has changed, reprocessing required') + return False + for idx, prev_name in enumerate(prev_names): + if object_names[idx] != prev_name: + log.info('File names have changed, reprocessing required') + return False + + log.info('All files have already been processed') + return True + + +def write_done_file(folder: str, args_str: str, object_names: List[str]): + """Write a file to signify completion. + + This the done file includes the arguments to processing and + a list of objects that were processed. + + Args: + folder (str): Folder to write the done file to + args_str (str): String representation of arguments + object_names (List[str]): List of objects to convert to MDS format + """ + with open(os.path.join(folder, DONE_FILENAME), 'w') as done_file: + log.info(f'Writing done file.') + done_file.write('\n'.join([args_str] + object_names) + '\n') + log.info(f'Done file written successfully') + + +def convert_text_to_mds( + tokenizer_name: str, + output_folder: str, + input_folder: str, + concat_tokens: int, + eos_text: str, + bos_text: str, + no_wrap: bool, + compression: str, + processes: int, + args_str: str, + reprocess: bool, + trust_remote_code: bool, +): + """Convert a folder of text files to MDS format. + + Args: + tokenizer_name (str): Name of tokenizer to use + output_folder (str): Folder to write MDS shards to + input_folder (str): Folder of text files to process + concat_tokens (int): Concatenate up to this many tokens + eos_text (str): Text to append to each example to separate concatenated samples + bos_text (str): Text to prepend to each example to separate concatenated samples + no_wrap: (bool): Whether to let text examples wrap across multiple training examples + compression (str): The compression algorithm to use for MDS writing + processes (int): The number of processes to use. 
+ args_str (str): String representation of the arguments + reprocess (bool): Whether to always reprocess the given folder of text files + trust_remote_code (bool): If true, allows custom code to be executed to load the tokenizer + """ + is_remote_output = is_remote_path(output_folder) + log.info(f'Output is remote: {is_remote_output}') + + object_names = get_object_names(input_folder) + if len(object_names) == 0: + log.error(f'No text files found in input folder: {input_folder}') + raise InputFolderMissingDataError(input_folder) + + # Check if the text files in the bucket have already been processed. + if not reprocess and is_already_processed( + output_folder, + args_str, + object_names, + ): + log.info( + f'Input folder {input_folder} is already processed at {output_folder} and ' + + + 'reprocess is set to False. Set reprocess to True if you would like to force reprocessing.', + ) + return + + # Use a temporary local directory if the output is remote and there are more than 1 processes + local_output_folder = tempfile.TemporaryDirectory( + ).name if is_remote_output else output_folder + log.info(f'Using local output folder: {local_output_folder}') + + if os.path.isdir(output_folder) and len(os.listdir(output_folder)) > 0: + log.error(f'Output folder is not empty: {output_folder}') + raise OutputFolderNotEmptyError(output_folder) + + if processes > 1: + log.info(f'Using multiprocessing with {processes} processes') + # Download and convert the text files in parallel + args = get_task_args( + object_names, + local_output_folder, + input_folder, + processes, + tokenizer_name, + concat_tokens, + eos_text, + bos_text, + no_wrap, + compression, + trust_remote_code, + ) + with ProcessPoolExecutor(max_workers=processes) as executor: + list(executor.map(download_and_convert_starargs, args)) + + log.info('Merging MDS shards from each process') + # Merge the mds shards from each of the processes into a single folder + merge_shard_groups(local_output_folder) + else: + log.info('Using single process for download and conversion') + download_and_convert( + object_names, + local_output_folder, + input_folder, + tokenizer_name, + concat_tokens, + eos_text, + bos_text, + no_wrap, + compression, + trust_remote_code, + ) + + # Write a done file with the args and object names + write_done_file(local_output_folder, args_str, object_names) + + if is_remote_output: + # Upload the local output to the remote location + output_object_store = cast( + ObjectStore, + maybe_create_object_store_from_uri(output_folder), + ) + _, _, output_folder_prefix = parse_uri(output_folder) + files_to_upload = os.listdir(local_output_folder) + + for file in files_to_upload: + assert not os.path.isdir(file) + remote_path = os.path.join(output_folder_prefix, file) + output_object_store.upload_object( + remote_path, + os.path.join(local_output_folder, file), + ) + + +def _configure_logging(logging_level: str): + """Configure logging. + + Args: + logging_level (str): Logging level. 
+    """
+    logging.basicConfig(
+        format=
+        f'%(asctime)s: [%(process)d][%(threadName)s]: %(levelname)s: %(name)s: %(message)s',
+    )
+    logging_level = logging_level.upper()
+    logging.getLogger('llmfoundry').setLevel(logging_level)
+    logging.getLogger(__name__).setLevel(logging_level)
+    log.info(f'Logging level set to {logging_level}')
+
+
+def convert_text_to_mds_from_args(
+    output_folder: str,
+    input_folder: str,
+    compression: str,
+    concat_tokens: int,
+    tokenizer_name: str,
+    bos_text: Optional[str],
+    eos_text: Optional[str],
+    use_tokenizer_eos: bool,
+    no_wrap: bool,
+    processes: int,
+    reprocess: bool,
+    trust_remote_code: bool,
+    logging_level: str,
+) -> None:
+    """A wrapper for `convert_text_to_mds` to parse arguments.
+
+    Args:
+        output_folder (str): Folder to write MDS shards to
+        input_folder (str): Folder of text files to process
+        compression (str): The compression algorithm to use for MDS writing
+        concat_tokens (int): Concatenate up to this many tokens
+        tokenizer_name (str): The name of the tokenizer to use
+        bos_text (Optional[str]): The text to prepend to each example to separate concatenated examples
+        eos_text (Optional[str]): The text to append to each example to separate concatenated examples
+        use_tokenizer_eos (bool): Use the EOS text from the tokenizer
+        no_wrap (bool): Whether to let text examples wrap across multiple training examples
+        processes (int): The number of processes to use to download and convert the dataset
+        reprocess (bool): If true, reprocess the input_folder to MDS format. Otherwise, only reprocess upon changes to the input folder or dataset creation parameters.
+        trust_remote_code (bool): If true, allows custom code to be executed to load the tokenizer
+        logging_level (str): Logging level for the script. Default is INFO.
+
+    Raises:
+        ValueError: If `use_tokenizer_eos` is True and `eos_text` is not None
+    """
+    if use_tokenizer_eos:
+        # Ensure that eos text is not specified twice.
+        if eos_text is not None:
+            raise ValueError(
+                'Cannot set --eos_text with --use_tokenizer_eos. 
Please specify one.', + ) + tokenizer = AutoTokenizer.from_pretrained( + tokenizer_name, + trust_remote_code=trust_remote_code, + ) + eos_text = tokenizer.eos_token + + # now that we have validated them, change BOS/EOS to strings + if bos_text is None: + bos_text = '' + if eos_text is None: + eos_text = '' + _configure_logging(logging_level) + + # Define args for _args_str + args = { + 'tokenizer': tokenizer_name, + 'output_folder': output_folder, + 'input_folder': input_folder, + 'compression': compression, + 'concat_tokens': concat_tokens, + 'eos_text': eos_text, + 'bos_text': bos_text, + 'no_wrap': no_wrap, + 'processes': processes, + 'reprocess': reprocess, + 'trust_remote_code': trust_remote_code, + } + convert_text_to_mds( + tokenizer_name=tokenizer_name, + output_folder=output_folder, + input_folder=input_folder, + concat_tokens=concat_tokens, + eos_text=eos_text, + bos_text=bos_text, + no_wrap=no_wrap, + compression=compression, + processes=processes, + reprocess=reprocess, + trust_remote_code=trust_remote_code, + args_str=str(args), + ) diff --git a/scripts/data_prep/convert_text_to_mds.py b/scripts/data_prep/convert_text_to_mds.py index 8af8280465..c808fa871f 100644 --- a/scripts/data_prep/convert_text_to_mds.py +++ b/scripts/data_prep/convert_text_to_mds.py @@ -2,115 +2,17 @@ # SPDX-License-Identifier: Apache-2.0 import logging -import math -import os -import tempfile from argparse import ArgumentParser, Namespace -from concurrent.futures import ProcessPoolExecutor -from functools import partial -from glob import glob -from typing import Dict, Iterable, List, Tuple, cast -import numpy as np import psutil -from composer.utils import ( - ObjectStore, - maybe_create_object_store_from_uri, - parse_uri, -) -from numpy.typing import NDArray -from streaming import MDSWriter -from tqdm import tqdm -from transformers import AutoTokenizer, PreTrainedTokenizerBase -from llmfoundry.data.data import AbstractConcatTokensDataset -from llmfoundry.utils.data_prep_utils import ( - DownloadingIterable, - download_file, - merge_shard_groups, -) -from llmfoundry.utils.exceptions import ( - InputFolderMissingDataError, - OutputFolderNotEmptyError, -) +from llmfoundry.command_utils import convert_text_to_mds_from_args log = logging.getLogger(__name__) DONE_FILENAME = '.text_to_mds_conversion_done' -class ConcatTokensFromFilesDataset(AbstractConcatTokensDataset): - """An IterableDataset that returns token samples for MDSWriter from files. - - Returns dicts of {'tokens': ndarray:int32} - - Each file is considered a sequence. 
- """ - - def __init__( - self, - files: Iterable[str], - tokenizer: PreTrainedTokenizerBase, - max_length: int, - bos_text: str, - eos_text: str, - no_wrap: bool, - ): - self.files = files - super().__init__(tokenizer, max_length, bos_text, eos_text, no_wrap) - log.info(f'Initialized ConcatTokensFromFilesDataset.') - - def __iter__(self) -> Iterable[Dict[str, NDArray]]: - log.info( - 'Starting iteration over files in ConcatTokensFromFilesDataset', - ) - buffer = [] - for file in self.files: - log.info(f'Processing file: {file}') - with open(file, 'r') as f: - buffer += self.bos_tokens - first_chunk = True - # Read the file in 1MB chunks to avoid memory issues - for chunk in iter(partial(f.read, 1000000), ''): - # Tokenize the chunk - encoded = self.tokenizer( - chunk, - truncation=False, - padding=False, - ) - iids = encoded['input_ids'] - - # If this is not the first chunk, remove the BOS token - if not first_chunk: - if iids[0] == self.tokenizer.bos_token_id: - iids = iids[1:] - - # Add the tokens to the buffer - buffer += iids - while len(buffer) >= self.max_length: - concat_sample = buffer[:self.max_length] - buffer = buffer[self. - max_length:] if self.should_wrap else [] - yield { - 'tokens': np.asarray(concat_sample, dtype=np.int32), - } - - first_chunk = False - - # Add the EOS token to the buffer to separate files. - buffer += self.eos_tokens - - # Yield any remaining samples of size max_length. - while len(buffer) >= self.max_length: - concat_sample = buffer[:self.max_length] - buffer = buffer[self.max_length:] if self.should_wrap else [] - yield {'tokens': np.asarray(concat_sample, dtype=np.int32)} - - log.info( - 'Finished iterating over files in ConcatTokensFromFilesDataset', - ) - - def parse_args() -> Namespace: """Parse commandline arguments.""" parser = ArgumentParser( @@ -211,454 +113,23 @@ def parse_args() -> Namespace: help='Logging level for the script. Default is INFO.', ) parsed = parser.parse_args() - - # Set eos token. - if parsed.use_tokenizer_eos: - # Ensure that eos text is not specified twice. - if parsed.eos_text is not None: - parser.error( - 'Cannot set --eos_text with --use_tokenizer_eos. Please specify one.', - ) - tokenizer = AutoTokenizer.from_pretrained( - parsed.tokenizer, - trust_remote_code=parsed.trust_remote_code, - ) - parsed.eos_text = tokenizer.eos_token - - # now that we have validated them, change BOS/EOS to strings - if parsed.bos_text is None: - parsed.bos_text = '' - if parsed.eos_text is None: - parsed.eos_text = '' return parsed -def get_object_names(input_folder: str) -> List[str]: - """Get object names from a local or remote folder. - - Args: - input_folder (str): local or remote folder path. 
- """ - object_store = maybe_create_object_store_from_uri(input_folder) - if object_store is not None: - _, _, folder_prefix = parse_uri(input_folder) - names = [ - name for name in object_store.list_objects(folder_prefix) - if name.endswith('.txt') - ] - log.info(f'Found {len(names)} text files in remote storage') - else: - # input_folder is a local folder - names = [ - text_file for dirpath, _, _ in os.walk(input_folder) - for text_file in glob(os.path.join(dirpath, '*.txt')) - ] - # return names, sizes - log.info(f'Found {len(names)} text files at {input_folder}') - - return names - - -def get_task_args( - object_names: List[str], - output_root: str, - input_folder: str, - n_groups: int, - tokenizer_name: str, - concat_tokens: int, - eos_text: str, - bos_text: str, - no_wrap: bool, - compression: str, - trust_remote_code: bool, -) -> Iterable: - """Get download_and_convert arguments split across n_groups. - - Each group handles a portion of object_names. - - Args: - object_names (List[str]): Names of objects to process - output_root (str): Folder to write MDS shards to - input_folder (str): Folder of text files to process - n_groups (int): Number of groups to split the object names into - tokenizer_name (str): Name of tokenizer to use - concat_tokens (int): Concatenate up to this many tokens - eos_text (str): Text to append to each example to separate concatenated samples - bos_text (str): Text to prepend to each example to separate concatenated samples - no_wrap: (bool): Whether to let text examples wrap across multiple training examples - compression (str): The compression algorithm to use for MDS writing - trust_remote_code (bool): If true, allows custom code to be executed to load the tokenizer - """ - log.info( - f'Preparing task arguments for {len(object_names)} objects across {n_groups} groups', - ) - num_objects = len(object_names) - objs_per_group = math.ceil(num_objects / n_groups) - for group, i in enumerate(range(0, num_objects, objs_per_group)): - output_subdir = os.path.join(output_root, str(group)) - log.info( - f'Created task for group {group} with {min(objs_per_group, num_objects - i)} objects', - ) - yield ( - object_names[i:min(i + objs_per_group, num_objects)], - output_subdir, - input_folder, - tokenizer_name, - concat_tokens, - eos_text, - bos_text, - no_wrap, - compression, - trust_remote_code, - ) - - -def download_and_convert_starargs(args: Tuple): - """Helper function to call download_and_convert with star args. - - This helps us use download_and_convert with multiprocessing. - """ - return download_and_convert(*args) - - -def download_and_convert( - file_names: List[str], - output_folder: str, - input_folder: str, - tokenizer_name: str, - concat_tokens: int, - eos_text: str, - bos_text: str, - no_wrap: bool, - compression: str, - trust_remote_code: bool, -): - """Downloads and converts text files to MDS format. 
- - Args: - file_names (List[str]): Files to process - output_folder (str): Folder to write MDS shards to - input_folder (str): Folder of text files to process - tokenizer_name (str): Name of tokenizer to use - concat_tokens (int): Concatenate up to this many tokens - eos_text (str): Text to append to each example to separate concatenated samples - bos_text (str): Text to prepend to each example to separate concatenated samples - no_wrap: (bool): Whether to let text examples wrap across multiple training examples - compression (str): The compression algorithm to use for MDS writing - trust_remote_code (bool): If true, allows custom code to be executed to load the tokenizer - """ - log.info(f'Starting download and conversion for {len(file_names)} files') - - object_store = maybe_create_object_store_from_uri(input_folder) - - # Download file_names - with tempfile.TemporaryDirectory() as tmp_dir: - log.info(f'Created temporary directory: {tmp_dir}') - downloading_iter = DownloadingIterable( - object_names=file_names, - output_folder=tmp_dir, - object_store=object_store, - ) - log.info(f'Initializing tokenizer: {tokenizer_name}') - tokenizer = AutoTokenizer.from_pretrained( - tokenizer_name, - trust_remote_code=trust_remote_code, - ) - tokenizer.model_max_length = 5000000000 # Hack to prevent warnings from HuggingFace - - # Use the ConcatTokensDataset from LLM-foundry to concatenate sequences of tokens up - # to the maximum sequence length - dataset = ConcatTokensFromFilesDataset( - files=downloading_iter, - max_length=concat_tokens, - tokenizer=tokenizer, - eos_text=eos_text, - bos_text=bos_text, - no_wrap=no_wrap, - ) - - columns = {'tokens': 'ndarray:int32'} - - log.info('Converting to MDS format...') - with MDSWriter( - out=output_folder, - columns=columns, - compression=compression, - ) as out: - for sample in tqdm(dataset): - out.write(sample) - - log.info(f'Completed download and conversion for {len(file_names)} files') - - -def is_remote_path(path: str) -> bool: - """Checks whether a path is a remote path. - - Args: - path (str): path to check - """ - backend, _, _ = parse_uri(path) - return backend != '' - - -def is_already_processed( - output_root: str, - args_str: str, - object_names: List[str], -) -> bool: - """Determines whether a group of text files has already been processed. - - Checks the done fie at output root to determine this. 
- - Args: - output_root (str): Output folder where a done file may exist - args_str (str): String representation of the arguments - object_names (List[str]): Names of objects to convert to MDS format - """ - log.info( - f'Checking if {len(object_names)} objects have already been processed in {output_root}', - ) - - # Retrieve the done file contents - output_object_store = maybe_create_object_store_from_uri(output_root) - if output_object_store is not None: - # Download and read the done file from the remote object store - _, _, output_folder_prefix = parse_uri(output_root) - try: - with tempfile.TemporaryDirectory() as tmp_dir: - done_file = os.path.join(tmp_dir, DONE_FILENAME) - download_file( - object_store=output_object_store, - object_name=os.path.join( - output_folder_prefix, - DONE_FILENAME, - ), - output_filename=done_file, - ) - with open(done_file) as df: - done_file_contents = df.read().splitlines() - log.info(f'Retrieved done file contents from remote storage') - except FileNotFoundError: - log.info('Done file not found in remote storage') - return False - else: - # Read the local done file - done_file = os.path.join(output_root, DONE_FILENAME) - if not os.path.isfile(done_file): - log.info('Done file not found in local storage') - return False - with open(done_file) as df: - done_file_contents = df.read().splitlines() - log.info(f'Retrieved done file contents from local storage') - - # Compare the arguments - prev_args_str = done_file_contents[0] - if prev_args_str != args_str: - log.info('Arguments have changed, reprocessing required') - return False - - # Compare file names - prev_names = done_file_contents[1:] - if len(prev_names) != len(object_names): - log.info('Number of files has changed, reprocessing required') - return False - for idx, prev_name in enumerate(prev_names): - if object_names[idx] != prev_name: - log.info('File names have changed, reprocessing required') - return False - - log.info('All files have already been processed') - return True - - -def write_done_file(folder: str, args_str: str, object_names: List[str]): - """Write a file to signify completion. - - This the done file includes the arguments to processing and - a list of objects that were processed. - - Args: - folder (str): Folder to write the done file to - args_str (str): String representation of arguments - object_names (List[str]): List of objects to convert to MDS format - """ - with open(os.path.join(folder, DONE_FILENAME), 'w') as done_file: - log.info(f'Writing done file.') - done_file.write('\n'.join([args_str] + object_names) + '\n') - log.info(f'Done file written successfully') - - -def convert_text_to_mds( - tokenizer_name: str, - output_folder: str, - input_folder: str, - concat_tokens: int, - eos_text: str, - bos_text: str, - no_wrap: bool, - compression: str, - processes: int, - args_str: str, - reprocess: bool, - trust_remote_code: bool, -): - """Convert a folder of text files to MDS format. - - Args: - tokenizer_name (str): Name of tokenizer to use - output_folder (str): Folder to write MDS shards to - input_folder (str): Folder of text files to process - concat_tokens (int): Concatenate up to this many tokens - eos_text (str): Text to append to each example to separate concatenated samples - bos_text (str): Text to prepend to each example to separate concatenated samples - no_wrap: (bool): Whether to let text examples wrap across multiple training examples - compression (str): The compression algorithm to use for MDS writing - processes (int): The number of processes to use. 
- args_str (str): String representation of the arguments - reprocess (bool): Whether to always reprocess the given folder of text files - trust_remote_code (bool): If true, allows custom code to be executed to load the tokenizer - """ - is_remote_output = is_remote_path(output_folder) - log.info(f'Output is remote: {is_remote_output}') - - object_names = get_object_names(input_folder) - if len(object_names) == 0: - log.error(f'No text files found in input folder: {input_folder}') - raise InputFolderMissingDataError(input_folder) - - # Check if the text files in the bucket have already been processed. - if not reprocess and is_already_processed( - output_folder, - args_str, - object_names, - ): - log.info( - f'Input folder {input_folder} is already processed at {output_folder} and ' - + - 'reprocess is set to False. Set reprocess to True if you would like to force reprocessing.', - ) - return - - # Use a temporary local directory if the output is remote and there are more than 1 processes - local_output_folder = tempfile.TemporaryDirectory( - ).name if is_remote_output else output_folder - log.info(f'Using local output folder: {local_output_folder}') - - if os.path.isdir(output_folder) and len(os.listdir(output_folder)) > 0: - log.error(f'Output folder is not empty: {output_folder}') - raise OutputFolderNotEmptyError(output_folder) - - if processes > 1: - log.info(f'Using multiprocessing with {processes} processes') - # Download and convert the text files in parallel - args = get_task_args( - object_names, - local_output_folder, - input_folder, - processes, - tokenizer_name, - concat_tokens, - eos_text, - bos_text, - no_wrap, - compression, - trust_remote_code, - ) - with ProcessPoolExecutor(max_workers=processes) as executor: - list(executor.map(download_and_convert_starargs, args)) - - log.info('Merging MDS shards from each process') - # Merge the mds shards from each of the processes into a single folder - merge_shard_groups(local_output_folder) - else: - log.info('Using single process for download and conversion') - download_and_convert( - object_names, - local_output_folder, - input_folder, - tokenizer_name, - concat_tokens, - eos_text, - bos_text, - no_wrap, - compression, - trust_remote_code, - ) - - # Write a done file with the args and object names - write_done_file(local_output_folder, args_str, object_names) - - if is_remote_output: - # Upload the local output to the remote location - output_object_store = cast( - ObjectStore, - maybe_create_object_store_from_uri(output_folder), - ) - _, _, output_folder_prefix = parse_uri(output_folder) - files_to_upload = os.listdir(local_output_folder) - - for file in files_to_upload: - assert not os.path.isdir(file) - remote_path = os.path.join(output_folder_prefix, file) - output_object_store.upload_object( - remote_path, - os.path.join(local_output_folder, file), - ) - - -def _args_str(original_args: Namespace) -> str: - """Create a string from the args to determine whether to reprocess. - - Args: - original_args (Namespace): Arguments to main function. - """ - # Take the arguments that influence the final result. - # reprocess and max_mds_writer_workers are not taken. 
- args = Namespace( - tokenizer_name=original_args.tokenizer, - output_folder=original_args.output_folder, - input_folder=original_args.input_folder, - concat_tokens=original_args.concat_tokens, - eos_text=original_args.eos_text, - bos_text=original_args.bos_text, - no_wrap=original_args.no_wrap, - compression=original_args.compression, - processes=original_args.processes, - ) - - return str(args) - - -def _configure_logging(logging_level: str): - """Configure logging. - - Args: - logging_level (str): Logging level. - """ - logging.basicConfig( - format= - f'%(asctime)s: [%(process)d][%(threadName)s]: %(levelname)s: %(name)s: %(message)s', - ) - logging_level = logging_level.upper() - logging.getLogger('llmfoundry').setLevel(logging_level) - logging.getLogger(__name__).setLevel(logging_level) - log.info(f'Logging level set to {logging_level}') - - if __name__ == '__main__': args = parse_args() - _configure_logging(args.logging_level) - convert_text_to_mds( - tokenizer_name=args.tokenizer, + convert_text_to_mds_from_args( output_folder=args.output_folder, input_folder=args.input_folder, + compression=args.compression, concat_tokens=args.concat_tokens, - eos_text=args.eos_text, + tokenizer_name=args.tokenizer, bos_text=args.bos_text, + eos_text=args.eos_text, + use_tokenizer_eos=args.use_tokenizer_eos, no_wrap=args.no_wrap, - compression=args.compression, processes=args.processes, reprocess=args.reprocess, trust_remote_code=args.trust_remote_code, - args_str=_args_str(args), + logging_level=args.logging_level, ) diff --git a/tests/a_scripts/data_prep/test_convert_text_to_mds.py b/tests/a_scripts/data_prep/test_convert_text_to_mds.py index 8dac151f55..f4c160790a 100644 --- a/tests/a_scripts/data_prep/test_convert_text_to_mds.py +++ b/tests/a_scripts/data_prep/test_convert_text_to_mds.py @@ -13,11 +13,7 @@ from streaming import StreamingDataset from transformers import AutoTokenizer -from llmfoundry.utils.exceptions import ( - InputFolderMissingDataError, - OutputFolderNotEmptyError, -) -from scripts.data_prep.convert_text_to_mds import ( +from llmfoundry.command_utils.data_prep.convert_text_to_mds import ( DONE_FILENAME, convert_text_to_mds, download_and_convert, @@ -25,6 +21,10 @@ merge_shard_groups, write_done_file, ) +from llmfoundry.utils.exceptions import ( + InputFolderMissingDataError, + OutputFolderNotEmptyError, +) class MockObjectStore(): @@ -83,15 +83,15 @@ def _assert_files_exist(prefix: str, files: List[str]): @pytest.mark.parametrize('processes', [1, 2, 3]) @patch.object(ProcessPoolExecutor, 'map', new=Mock(wraps=_mock_map)) @patch( - 'scripts.data_prep.convert_text_to_mds.maybe_create_object_store_from_uri', + 'llmfoundry.command_utils.data_prep.convert_text_to_mds.maybe_create_object_store_from_uri', ) -@patch('scripts.data_prep.convert_text_to_mds.parse_uri') +@patch('llmfoundry.command_utils.data_prep.convert_text_to_mds.parse_uri') @patch( - 'scripts.data_prep.convert_text_to_mds.download_and_convert', + 'llmfoundry.command_utils.data_prep.convert_text_to_mds.download_and_convert', wraps=download_and_convert, ) @patch( - 'scripts.data_prep.convert_text_to_mds.merge_shard_groups', + 'llmfoundry.command_utils.data_prep.convert_text_to_mds.merge_shard_groups', wraps=merge_shard_groups, ) def test_single_and_multi_process( From 006f251f6d2ef5859545728045d7c7de1fd29c59 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Wed, 17 Jul 2024 23:08:35 -0700 Subject: [PATCH 11/57] Fix hf dataset hang on small dataset (#1370) --- 
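Context: the tokenization step maps the dataset across multiple worker processes,
and datasets.Dataset.map appears to hang when it is given more workers than there
are examples. The change below therefore falls back to a single process for very
small datasets, roughly:

    num_cpus_to_use = max(1, (os.cpu_count() or 1) - 8)
    if len(dataset) < num_cpus_to_use:
        num_cpus_to_use = 1  # fewer examples than workers: run the map single-process
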
llmfoundry/data/finetuning/tasks.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index 0adad8af4e..78bfb9c74c 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -913,6 +913,8 @@ def dataset_mapper(example: Dict): detected_cpu_count = os.cpu_count() or 1 detected_cpus_with_margin = detected_cpu_count - 8 num_cpus_to_use = max(1, detected_cpus_with_margin) + if len(dataset) < num_cpus_to_use: + num_cpus_to_use = 1 columns_to_remove = list(dataset[0].keys()) tokenized_dataset = dataset.map( From acb55300a5db28f98c0579d52802c7035f58d533 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Wed, 17 Jul 2024 23:36:40 -0700 Subject: [PATCH 12/57] Add LoadPlanner and SavePlanner registries (#1358) --- llmfoundry/command_utils/train.py | 27 +++++++++++++++++++++ llmfoundry/registry.py | 39 +++++++++++++++++++++++++++++++ llmfoundry/utils/builders.py | 39 +++++++++++++++++++++++++++++++ tests/test_registry.py | 2 ++ tests/utils/test_builders.py | 35 +++++++++++++++++++++++++++ 5 files changed, 142 insertions(+) diff --git a/llmfoundry/command_utils/train.py b/llmfoundry/command_utils/train.py index f49fb28801..feed1e9fb1 100644 --- a/llmfoundry/command_utils/train.py +++ b/llmfoundry/command_utils/train.py @@ -36,8 +36,10 @@ build_callback, build_composer_model, build_evaluators, + build_load_planner, build_logger, build_optimizer, + build_save_planner, build_scheduler, build_tokenizer, ) @@ -256,6 +258,31 @@ def train(cfg: DictConfig) -> Trainer: # Optional fsdp data, fine-tuning, and eval configs fsdp_config: Optional[Dict[str, Any]] = train_cfg.fsdp_config + if fsdp_config is not None: + if 'load_planner' in fsdp_config: + load_planners = fsdp_config['load_planner'].items() + if len(load_planners) > 1: + raise ValueError( + 'Only one load planner can be specified in the config.', + ) + load_planner_name, load_planner_config = load_planners[0] + fsdp_config['load_planner'] = build_load_planner( + load_planner_name, + **load_planner_config, + ) + + if 'save_planner' in fsdp_config: + save_planners = fsdp_config['save_planner'].items() + if len(save_planners) > 1: + raise ValueError( + 'Only one save planner can be specified in the config.', + ) + save_planner_name, save_planner_config = save_planners[0] + fsdp_config['save_planner'] = build_save_planner( + save_planner_name, + **save_planner_config, + ) + eval_loader_config = train_cfg.eval_loader if train_cfg.eval_loader is not None else train_cfg.eval_loaders icl_tasks_config = train_cfg.icl_tasks or train_cfg.icl_tasks_str eval_gauntlet_config = train_cfg.eval_gauntlet or train_cfg.eval_gauntlet_str diff --git a/llmfoundry/registry.py b/llmfoundry/registry.py index 50481211ac..e31840d3fb 100644 --- a/llmfoundry/registry.py +++ b/llmfoundry/registry.py @@ -6,6 +6,7 @@ from composer.loggers import LoggerDestination from composer.models import ComposerModel from composer.optim import ComposerScheduler +from torch.distributed.checkpoint import LoadPlanner, SavePlanner from torch.optim import Optimizer from torch.utils.data import DataLoader as TorchDataloader from torch.utils.data import Dataset @@ -339,6 +340,42 @@ description=_config_transforms_description, ) +_load_planners_description = ( + """The load_planners registry is used to register classes that implement the LoadPlanner interface. + + The LoadPlanner will be passed as part of the FSDP config arg of the Trainer. It will be used to load distributed checkpoints. 
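+
+    For example, a training YAML can select a registered planner through the FSDP
+    config; the planner name and kwargs below are illustrative:
+
+        fsdp_config:
+            load_planner:
+                my_load_planner:
+                    some_kwarg: some_value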
+ + Returns: + LoadPlanner: The load planner. + """ +) + +load_planners = create_registry( + 'llmfoundry', + 'load_planners', + generic_type=Type[LoadPlanner], + entry_points=True, + description=_load_planners_description, +) + +_save_planners_description = ( + """The save_planners registry is used to register classes that implement the SavePlanner interface. + + The savePlanner will be passed as part of the FSDP config arg of the Trainer. It will be used to save distributed checkpoints. + + Returns: + SavePlanner: The save planner. + """ +) + +save_planners = create_registry( + 'llmfoundry', + 'save_planners', + generic_type=Type[SavePlanner], + entry_points=True, + description=_save_planners_description, +) + __all__ = [ 'loggers', 'callbacks', @@ -363,4 +400,6 @@ 'fcs', 'icl_datasets', 'config_transforms', + 'load_planners', + 'save_planners', ] diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 012a0b704f..0437736f74 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -27,6 +27,7 @@ from composer.utils import dist from omegaconf import DictConfig from omegaconf import OmegaConf as om +from torch.distributed.checkpoint import LoadPlanner, SavePlanner from torch.optim.optimizer import Optimizer from torchmetrics import Metric from transformers import AutoTokenizer, PreTrainedTokenizerBase @@ -187,6 +188,44 @@ def build_icl_data_and_gauntlet( return icl_evaluators, logger_keys, eval_gauntlet_cb +def build_load_planner(name: str, **kwargs: Any) -> LoadPlanner: + """Builds a load planner from the registry. + + Args: + name: Name of the load planner to build. + + Returns: + LoadPlanner: The load planner. + """ + return construct_from_registry( + name=name, + registry=registry.load_planners, + partial_function=True, + pre_validation_function=LoadPlanner, + post_validation_function=None, + kwargs=kwargs, + ) + + +def build_save_planner(name: str, **kwargs: Any) -> SavePlanner: + """Builds a save planner from the registry. + + Args: + name: Name of the save planner to build. + + Returns: + savePlanner: The save planner. 
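+
+    Example (the registered name and planner class are illustrative):
+        from llmfoundry.registry import save_planners
+
+        save_planners.register('my_planner', func=MySavePlanner)
+        planner = build_save_planner('my_planner')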
+ """ + return construct_from_registry( + name=name, + registry=registry.save_planners, + partial_function=True, + pre_validation_function=SavePlanner, + post_validation_function=None, + kwargs=kwargs, + ) + + def build_composer_model( name: str, cfg: Dict[str, Any], diff --git a/tests/test_registry.py b/tests/test_registry.py index 7ee95442c8..aa0c93ee13 100644 --- a/tests/test_registry.py +++ b/tests/test_registry.py @@ -44,6 +44,8 @@ def test_expected_registries_exist(): 'fcs', 'icl_datasets', 'config_transforms', + 'load_planners', + 'save_planners', } assert existing_registries == expected_registry_names diff --git a/tests/utils/test_builders.py b/tests/utils/test_builders.py index dfcb5b327c..fb6cb0c5df 100644 --- a/tests/utils/test_builders.py +++ b/tests/utils/test_builders.py @@ -13,17 +13,24 @@ from composer.callbacks import Generate from composer.core import Evaluator from composer.loggers import WandBLogger +from torch.distributed.checkpoint.default_planner import ( + DefaultLoadPlanner, + DefaultSavePlanner, +) from transformers import PreTrainedTokenizerBase from llmfoundry.callbacks import HuggingFaceCheckpointer +from llmfoundry.registry import load_planners, save_planners from llmfoundry.tokenizers.tiktoken import TiktokenTokenizerWrapper from llmfoundry.utils.builders import ( add_metrics_to_eval_loaders, build_callback, build_eval_loaders, build_evaluators, + build_load_planner, build_logger, build_optimizer, + build_save_planner, build_tokenizer, ) @@ -345,6 +352,34 @@ def test_build_eval_loaders(monkeypatch: pytest.MonkeyPatch): assert eval_loaders2[1].metric_names == [] +def test_build_load_planner(): + # Dummy LoadPlanner for testing + class DummyLoadPlanner(DefaultLoadPlanner): + + def __init__(self, is_test: bool): + self.is_test = is_test + + load_planners.register('dummy', func=DummyLoadPlanner) + load_planner = build_load_planner('dummy', is_test=True) + + assert isinstance(load_planner, DummyLoadPlanner) + assert load_planner.is_test is True + + +def test_build_save_planner(): + # Dummy SavePlanner for testing + class DummySavePlanner(DefaultSavePlanner): + + def __init__(self, is_test: bool): + self.is_test = is_test + + save_planners.register('dummy', func=DummySavePlanner) + save_planner = build_save_planner('dummy', is_test=True) + + assert isinstance(save_planner, DummySavePlanner) + assert save_planner.is_test is True + + def test_add_metrics_to_eval_loaders(): evaluators = [ Evaluator( From 900c6a7f378fea3fb5541d95dc8f32dee294a90c Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Thu, 18 Jul 2024 14:59:27 -0700 Subject: [PATCH 13/57] rank 0 first (#1371) --- llmfoundry/models/hf/hf_causal_lm.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py index 536cd0257d..e3fa8d03a3 100644 --- a/llmfoundry/models/hf/hf_causal_lm.py +++ b/llmfoundry/models/hf/hf_causal_lm.py @@ -216,6 +216,22 @@ def build_inner_model( + 'Please `pip install llm-foundry[gpu]`.', ) + # Hugging Face copies the modules into the + # transformers modules cache. On particular systems, this operation seems to cause contention between + # the different processes. To avoid this contention, we first create the config on local rank + # zero. This will set up the transformers module cache and avoid the future contention. 
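+        # All other ranks wait at the barrier below and only build the config once
+        # rank 0 has populated the cache.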
+ if dist.get_local_rank() == 0: + AutoConfig.from_pretrained( + pretrained_model_name_or_path, + trust_remote_code=trust_remote_code, + use_auth_token=use_auth_token, + attn_implementation=requested_attention_implementation, + use_cache= + False, # Necessary due to https://github.com/huggingface/transformers/issues/28056 + ) + + dist.barrier() + # Construct the Hugging Face config to use config = AutoConfig.from_pretrained( pretrained_model_name_or_path, From 59f1a0af8ec578f870bc9d9cdd5ef14a8f8e7ec7 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Fri, 19 Jul 2024 20:29:19 -0700 Subject: [PATCH 14/57] Add convert_finetuning_dataset to CLI (#1354) * convert finetuning dataset * cli * typo * typo * commit comments * type * precommit * help * typo * delete old * rerun * rerun * more db imports * typo * yapf --------- Co-authored-by: v-chen_data --- llmfoundry/cli/data_prep_cli.py | 92 +++++ llmfoundry/command_utils/__init__.py | 6 + .../data_prep/convert_finetuning_dataset.py | 346 ++++++++++++++++++ .../data_prep/convert_finetuning_dataset.py | 267 ++------------ tests/data/test_dataloader.py | 4 +- 5 files changed, 467 insertions(+), 248 deletions(-) create mode 100644 llmfoundry/command_utils/data_prep/convert_finetuning_dataset.py diff --git a/llmfoundry/cli/data_prep_cli.py b/llmfoundry/cli/data_prep_cli.py index 3ca53f4104..b2a4af4521 100644 --- a/llmfoundry/cli/data_prep_cli.py +++ b/llmfoundry/cli/data_prep_cli.py @@ -9,6 +9,7 @@ from llmfoundry.command_utils import ( convert_dataset_hf_from_args, convert_dataset_json_from_args, + convert_finetuning_dataset_from_args, convert_text_to_mds_from_args, ) @@ -106,6 +107,97 @@ def convert_dataset_json( ) +@app.command(name='convert_finetuning_dataset') +def convert_finetuning_dataset_cli( + dataset: Annotated[ + str, + Option( + ..., + help= + 'Name of the dataset (e.g., first argument to `datasets.load_dataset`, for jsonl data format, it is `json`).', + )], + data_subset: Annotated[ + Optional[str], + Option(help='(Optional) subset of data to use.',)] = None, + splits: Annotated[str, + Option(help='Comma-separated list of dataset splits'), + ] = 'train,validation', + preprocessor: Annotated[ + Optional[str], + Option( + help= + 'Name or import path of function used to preprocess (reformat) the dataset.', + )] = None, + data_files: Annotated[ + str, Option(help='Data file for each split. Comma-separated.')] = '', + skip_preprocessing: Annotated[ + bool, Option(help='Whether to skip preprocessing.')] = False, + out_root: Annotated[ + str, + Option( + ..., + help= + 'Root path of output directory where MDS shards will be stored. 
Can be a remote URI.', + )] = '', + local: Annotated[ + Optional[str], + Option( + help= + '(Optional) root path of local directory if you want to keep a local copy when out_root is remote.', + )] = None, + compression: Annotated[ + Optional[str], + Option(help='(Optional) name of compression algorithm to use.')] = None, + num_workers: Annotated[Optional[int], + Option(help='Number of workers.')] = None, + tokenizer: Annotated[Optional[str], + Option(help='Tokenizer used for processing.')] = None, + tokenizer_kwargs: Annotated[ + Optional[str], + Option( + help= + 'Keyword arguments for tokenizer initialization in JSON format.', + )] = None, + max_seq_len: Annotated[int, Option(help='Maximum sequence length.')] = 2048, + target_prompts: Annotated[ + str, + Option(help='Policy for when to use prompts as training targets.'), + ] = 'none', + target_responses: Annotated[ + str, + Option(help='Policy for which responses to treat as training targets.'), + ] = 'last', + encoder_decoder: Annotated[ + bool, + Option( + help= + 'Set if the data are intended to be used to train an encoder-decoder model.', + )] = False, +): + """Convert a Finetuning Dataset to MDS streaming format.""" + # Convert comma-separated args + splits_list = splits.split(',') if splits else [] + data_files_list = data_files.split(',') if data_files else [] + convert_finetuning_dataset_from_args( + dataset=dataset, + data_subset=data_subset, + splits=splits_list, + preprocessor=preprocessor, + data_files=data_files_list, + skip_preprocessing=skip_preprocessing, + out_root=out_root, + local=local, + compression=compression, + num_workers=num_workers, + tokenizer=tokenizer, + tokenizer_kwargs=tokenizer_kwargs, + max_seq_len=max_seq_len, + target_prompts=target_prompts, + target_responses=target_responses, + encoder_decoder=encoder_decoder, + ) + + @app.command(name='convert_text_to_mds') def convert_text_to_mds( output_folder: Annotated[str, Option(..., help='The folder to write output to')], diff --git a/llmfoundry/command_utils/__init__.py b/llmfoundry/command_utils/__init__.py index 995c5345e7..3a99f09f86 100644 --- a/llmfoundry/command_utils/__init__.py +++ b/llmfoundry/command_utils/__init__.py @@ -8,6 +8,10 @@ convert_dataset_json, convert_dataset_json_from_args, ) +from llmfoundry.command_utils.data_prep.convert_finetuning_dataset import ( + convert_finetuning_dataset, + convert_finetuning_dataset_from_args, +) from llmfoundry.command_utils.data_prep.convert_text_to_mds import ( convert_text_to_mds, convert_text_to_mds_from_args, @@ -36,6 +40,8 @@ 'convert_dataset_hf_from_args', 'convert_dataset_json', 'convert_dataset_json_from_args', + 'convert_finetuning_dataset_from_args', + 'convert_finetuning_dataset', 'convert_text_to_mds', 'convert_text_to_mds_from_args', ] diff --git a/llmfoundry/command_utils/data_prep/convert_finetuning_dataset.py b/llmfoundry/command_utils/data_prep/convert_finetuning_dataset.py new file mode 100644 index 0000000000..94cd79815b --- /dev/null +++ b/llmfoundry/command_utils/data_prep/convert_finetuning_dataset.py @@ -0,0 +1,346 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +import json +import os +import platform +import warnings +from typing import Any, Callable, Dict, Iterable, Optional, Union + +import datasets as hf_datasets +import psutil +from datasets import Dataset, DatasetDict, IterableDataset, IterableDatasetDict +from streaming import MDSWriter +from torch.utils.data import DataLoader +from tqdm import tqdm + +from 
llmfoundry.data.finetuning.collator import validate_target_settings +from llmfoundry.data.finetuning.tasks import ( + _get_example_type, + dataset_constructor, + is_valid_ift_example, + tokenize_formatted_example, +) +from llmfoundry.utils.builders import build_tokenizer + +HFDataset = Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset] + + +def build_dataloader( + dataset: HFDataset, + batch_size: int, + num_workers: Optional[int] = None, +) -> DataLoader: + if num_workers is None: + # Multiple workers is only supported on linux machines + if 'linux' in platform.platform().lower(): + num_workers = max(1, psutil.cpu_count()) + else: + num_workers = 0 + + # If using multiple workers, configure each worker to prefetch as many samples as it can, up to + # the aggregate device batch size + # If not using workers, the torch DataLoader expects the default value for prefetch_factor, + # which non-intuitively must be 2. + # If on macOS, PyTorch requires prefetch_factor set to None since num_workers is always zero + if 'macos' in platform.platform().lower() and num_workers == 0: + prefetch_factor = None + else: + prefetch_factor = max( + 1, + 2 * batch_size // num_workers, + ) if num_workers > 0 else 2 + + return DataLoader( + dataset=dataset, + sampler=None, + batch_size=batch_size, + num_workers=num_workers, + prefetch_factor=prefetch_factor, + ) + + +def generate_samples( + loader: DataLoader, + truncate_num_samples: Optional[int] = None, +) -> Iterable[Dict[str, bytes]]: + """Generator over samples of a dataloader. + + Args: + loader (DataLoader): A dataloader emitting batches like {key: [sample0_bytes, sample1_bytes, sample2_bytes, ...]} + truncate_num_samples (Optional[int]): An optional # of samples to stop at. + + Yields: + Sample dicts. + """ + n_samples = 0 + for batch in loader: + keys = list(batch.keys()) + current_bs = len(batch[keys[0]]) + for idx in range(current_bs): + if truncate_num_samples is not None and n_samples == truncate_num_samples: + return + n_samples += 1 + yield {k: v[idx] for k, v in batch.items()} + + +def get_columns_and_format( + dataset: HFDataset, + tokenizing: bool, + preprocessing_fn: Callable, +): + ex = preprocessing_fn(next(iter(dataset))) + example_type = _get_example_type(ex) + if tokenizing: + return {'turns': 'json'}, example_type + if example_type == 'chat': + # Chat format + return {'messages': 'json'}, example_type + else: + # Prompt-response format + return {'prompt': 'str', 'response': 'str'}, example_type + + +def convert_finetuning_dataset( + dataset: str, + data_subset: Optional[str], + splits: list[str], + preprocessor: Optional[str], + data_files: list[str], + skip_preprocessing: bool, + out_root: str, + local: Optional[str], + compression: Optional[str], + num_workers: Optional[int], + tokenizer: Optional[str], + tokenizer_kwargs: dict[str, Any], + max_seq_len: int, + target_prompts: str, + target_responses: str, + encoder_decoder: bool, +) -> None: + """Converts Finetuning datasets to MDS format. + + Args: + dataset (str): Name of the dataset (e.g., first argument to `datasets.load_dataset`, for jsonl data format, it is `json`). + data_subset (Optional[str]): Subset of data to use. + splits (list[str]): Comma-separated list of dataset splits + preprocessor (Optional[str]): Name or import path of function used to preprocess (reformat) the dataset. + data_files (list[str]): Data file for each split. Comma-separated. + skip_preprocessing (bool): Whether to skip preprocessing. 
+ out_root (str): Root path of output directory where MDS shards will be stored. Can be a remote URI. + local (Optional[str]): Root path of local directory if you want to keep a local copy when out_root is remote. + compression (Optional[str]): Name of compression algorithm to use. + num_workers (Optional[int]): Number of workers. + tokenizer (Optional[str]): Tokenizer used for processing. + tokenizer_kwargs (dict[str, Any]): Keyword arguments for tokenizer initialization. + max_seq_len (int): Maximum sequence length. + target_prompts (str): Policy for when to use prompts as training targets. + target_responses (str): Policy for which responses to treat as training targets. + encoder_decoder (bool): Set if the data are intended to be used to train an encoder-decoder model + + Raises: + ValueError: If the target settings are invalid. + """ + if skip_preprocessing: + preprocessing_fn = lambda x: x # Just an identity function + else: + preprocessor_str = preprocessor + preprocessing_fn = dataset_constructor.get_preprocessing_fn_from_str( + preprocessor=preprocessor_str, + dataset_name=dataset, + ) + if preprocessing_fn is None: + raise ValueError( + '`preprocessor` was not set and no preprocessing function ' +\ + 'has been registered for `dataset`. If this was intentional ' +\ + '(e.g., because your dataset is already correctly formatted), ' +\ + 'include the "--skip-preprocessing" flag to avoid this error.', + ) + + # Make sure the target settings are valid + validate_target_settings( + target_prompts=target_prompts, + target_responses=target_responses, + decoder_only_format=not encoder_decoder, + ) + + tokenizer = None + tokenizer_kwargs = tokenizer_kwargs + tokenizer_kwargs.update({'model_max_length': max_seq_len}) + if tokenizer: + tokenizer = build_tokenizer(tokenizer, tokenizer_kwargs) + + for i, split_name in enumerate(splits): + data_file = None + if len(data_files) > 0: + data_file = data_files[i] + loaded_dataset = hf_datasets.load_dataset( + path=dataset, + name=data_subset, + split=split_name, + data_files=data_file, + streaming=True, + ) + # Determine the output columns + columns, example_type = get_columns_and_format( + dataset=loaded_dataset, + tokenizing=tokenizer is not None, + preprocessing_fn=preprocessing_fn, + ) + # Prepare the iterables + if example_type == 'chat': + samples = iter(loaded_dataset) + else: + loader = build_dataloader( + dataset=loaded_dataset, + batch_size=512, + num_workers=num_workers, + ) + samples = generate_samples(loader) + + # Write samples + print(f'Converting {split_name} to MDS format...') + out = os.path.join(out_root, split_name) + if local is not None: + out = (os.path.join(local, split_name), out) + keep_local = True + else: + keep_local = False + with MDSWriter( + columns=columns, + out=out, + compression=compression, + keep_local=keep_local, + ) as out: + examples_removed = 0 + for sample in tqdm(samples, desc=split_name): + formatted_sample = preprocessing_fn(sample) + assert isinstance(formatted_sample, dict) + + # Use the _get_example_type utility to confirm that the formatted sample + # can be interpreted by the tokenization code + try: + example_type = _get_example_type(formatted_sample) + except Exception as e: + raise ValueError( + 'Encountered an error when checking example for proper formatting. 
' +\ + f'example={formatted_sample}', + ) from e + if tokenizer is not None: + sample = tokenize_formatted_example( + formatted_sample, + tokenizer=tokenizer, + ) + if not is_valid_ift_example( + max_seq_len, + target_prompts=target_prompts, + target_responses=target_responses, + decoder_only_format=not encoder_decoder, + example=sample, + ): + examples_removed += 1 + continue + + sample_to_write = {'turns': []} + for turn in sample['turns']: + turn_to_write = {} + for key in ['input_ids', 'labels']: + turn_to_write[key] = list(turn[key]) + sample_to_write['turns'].append(turn_to_write) + out.write(sample_to_write) + else: + if example_type == 'prompt_response': + encoded_sample = {} + for key in ['prompt', 'response']: + value = formatted_sample[key] + assert isinstance(value, str) + encoded_sample[key] = value.encode('utf-8') + out.write(encoded_sample) + else: + out.write(formatted_sample) + + if tokenizer is not None and examples_removed > 0: + warnings.warn( + f'Dropped {examples_removed} examples where the prompt was longer than {max_seq_len}, ' + + + 'the prompt or response was empty, or the response was all padding tokens.', + ) + + +def convert_finetuning_dataset_from_args( + dataset: str, + data_subset: Optional[str], + splits: list[str], + preprocessor: Optional[str], + data_files: list[str], + skip_preprocessing: bool, + out_root: str, + local: Optional[str], + compression: Optional[str], + num_workers: Optional[int], + tokenizer: Optional[str], + tokenizer_kwargs: Optional[str], + max_seq_len: int, + target_prompts: str, + target_responses: str, + encoder_decoder: bool, +): + """A wrapper for `convert_finetuning_dataset` to parse arguments. + + Args: + dataset (str): Name of the dataset (e.g., first argument to `datasets.load_dataset`, for jsonl data format, it is `json`). + data_subset (Optional[str]): Subset of data to use. + splits (list[str]): Comma-separated list of dataset splits + preprocessor (Optional[str]): Name or import path of function used to preprocess (reformat) the dataset. + data_files (list[str]): Data file for each split. Comma-separated. + skip_preprocessing (bool): Whether to skip preprocessing. + out_root (str): Root path of output directory where MDS shards will be stored. Can be a remote URI. + local (Optional[str]): Root path of local directory if you want to keep a local copy when out_root is remote. + compression (Optional[str]): Name of compression algorithm to use. + num_workers (Optional[int]): Number of workers. + tokenizer (Optional[str]): Tokenizer used for processing. + tokenizer_kwargs (Optional[str]): Keyword arguments for tokenizer initialization in JSON format. + max_seq_len (int): Maximum sequence length. + target_prompts (str): Policy for when to use prompts as training targets. + target_responses (str): Policy for which responses to treat as training targets. + encoder_decoder (bool): Set if the data are intended to be used to train an encoder-decoder model. + + Raises: + ValueError: If the target settings are invalid. + ValueError: If the output directory already contains the requested splits. 
+ """ + if os.path.isdir(out_root) and len( + set(os.listdir(out_root)).intersection(set(splits)), + ) > 0: + raise ValueError( + f'--out_root={out_root} contains {os.listdir(out_root)} which cannot overlap with the requested splits {splits}.', + ) + + if tokenizer_kwargs is not None: + parsed_tokenizer_kwargs = json.loads(tokenizer_kwargs) + else: + parsed_tokenizer_kwargs = {} + + if len(data_files) > 0 and len(data_files,) != len(splits): + raise ValueError( + f'If data_files is set, data_files and splits must have the same length. Got {len(data_files)=} while {len(splits)=}', + ) + convert_finetuning_dataset( + dataset=dataset, + data_subset=data_subset, + splits=splits, + preprocessor=preprocessor, + data_files=data_files, + skip_preprocessing=skip_preprocessing, + out_root=out_root, + local=local, + compression=compression, + num_workers=num_workers, + tokenizer=tokenizer, + tokenizer_kwargs=parsed_tokenizer_kwargs, + max_seq_len=max_seq_len, + target_prompts=target_prompts, + target_responses=target_responses, + encoder_decoder=encoder_decoder, + ) diff --git a/scripts/data_prep/convert_finetuning_dataset.py b/scripts/data_prep/convert_finetuning_dataset.py index 523d45093d..b28e25786b 100644 --- a/scripts/data_prep/convert_finetuning_dataset.py +++ b/scripts/data_prep/convert_finetuning_dataset.py @@ -1,28 +1,12 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 -import json -import os -import platform -import warnings from argparse import ArgumentParser, Namespace -from typing import Callable, Dict, Iterable, Optional, Union +from typing import Union -import datasets as hf_datasets -import psutil from datasets import Dataset, DatasetDict, IterableDataset, IterableDatasetDict -from streaming import MDSWriter -from torch.utils.data import DataLoader -from tqdm import tqdm -from llmfoundry.data.finetuning.collator import validate_target_settings -from llmfoundry.data.finetuning.tasks import ( - _get_example_type, - dataset_constructor, - is_valid_ift_example, - tokenize_formatted_example, -) -from llmfoundry.utils.builders import build_tokenizer +from llmfoundry.command_utils import convert_finetuning_dataset_from_args HFDataset = Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset] @@ -116,236 +100,9 @@ def parse_args() -> Namespace: ) parsed = parser.parse_args() - - if os.path.isdir(parsed.out_root) and len( - set(os.listdir(parsed.out_root)).intersection(set(parsed.splits)), - ) > 0: - raise ValueError( - f'--out_root={parsed.out_root} contains {os.listdir(parsed.out_root)} which cannot overlap with the requested splits {parsed.splits}.', - ) - - if parsed.tokenizer_kwargs is not None: - parsed.tokenizer_kwargs = json.loads(parsed.tokenizer_kwargs) - else: - parsed.tokenizer_kwargs = {} - - if len(parsed.data_files) > 0 and len( - parsed.data_files, - ) != len(parsed.splits): - raise ValueError( - f'If data_files is set, data_files and splits must have the same length. 
Got {len(parsed.data_files)=} while {len(parsed.splits)=}', - ) - return parsed -def build_dataloader( - dataset: HFDataset, - batch_size: int, - num_workers: Optional[int] = None, -) -> DataLoader: - if num_workers is None: - # Multiple workers is only supported on linux machines - if 'linux' in platform.platform().lower(): - num_workers = max(1, psutil.cpu_count()) - else: - num_workers = 0 - - # If using multiple workers, configure each worker to prefetch as many samples as it can, up to - # the aggregate device batch size - # If not using workers, the torch DataLoader expects the default value for prefetch_factor, - # which non-intuitively must be 2. - # If on macOS, PyTorch requires prefetch_factor set to None since num_workers is always zero - if 'macos' in platform.platform().lower() and num_workers == 0: - prefetch_factor = None - else: - prefetch_factor = max( - 1, - 2 * batch_size // num_workers, - ) if num_workers > 0 else 2 - - return DataLoader( - dataset=dataset, - sampler=None, - batch_size=batch_size, - num_workers=num_workers, - prefetch_factor=prefetch_factor, - ) - - -def generate_samples( - loader: DataLoader, - truncate_num_samples: Optional[int] = None, -) -> Iterable[Dict[str, bytes]]: - """Generator over samples of a dataloader. - - Args: - loader (DataLoader): A dataloader emitting batches like {key: [sample0_bytes, sample1_bytes, sample2_bytes, ...]} - truncate_num_samples (Optional[int]): An optional # of samples to stop at. - - Yields: - Sample dicts. - """ - n_samples = 0 - for batch in loader: - keys = list(batch.keys()) - current_bs = len(batch[keys[0]]) - for idx in range(current_bs): - if truncate_num_samples is not None and n_samples == truncate_num_samples: - return - n_samples += 1 - yield {k: v[idx] for k, v in batch.items()} - - -def get_columns_and_format( - dataset: HFDataset, - tokenizing: bool, - preprocessing_fn: Callable, -): - ex = preprocessing_fn(next(iter(dataset))) - example_type = _get_example_type(ex) - if tokenizing: - return {'turns': 'json'}, example_type - if example_type == 'chat': - # Chat format - return {'messages': 'json'}, example_type - else: - # Prompt-response format - return {'prompt': 'str', 'response': 'str'}, example_type - - -def main(args: Namespace) -> None: - """Main: create a streaming dataset. - - Args: - args (Namespace): Commandline arguments. - """ - if args.skip_preprocessing: - preprocessing_fn = lambda x: x # Just an identity function - else: - preprocessor_str = args.preprocessor - preprocessing_fn = dataset_constructor.get_preprocessing_fn_from_str( - preprocessor=preprocessor_str, - dataset_name=args.dataset, - ) - if preprocessing_fn is None: - raise ValueError( - '`args.preprocessor` was not set and no preprocessing function ' +\ - 'has been registered for `args.dataset`. 
If this was intentional ' +\ - '(e.g., because your dataset is already correctly formatted), ' +\ - 'include the "--skip-preprocessing" flag to avoid this error.', - ) - - # Make sure the target settings are valid - validate_target_settings( - target_prompts=args.target_prompts, - target_responses=args.target_responses, - decoder_only_format=not args.encoder_decoder, - ) - - tokenizer = None - tokenizer_kwargs = args.tokenizer_kwargs - tokenizer_kwargs.update({'model_max_length': args.max_seq_len}) - if args.tokenizer: - tokenizer = build_tokenizer(args.tokenizer, tokenizer_kwargs) - - for i, split_name in enumerate(args.splits): - data_file = None - if len(args.data_files) > 0: - data_file = args.data_files[i] - dataset = hf_datasets.load_dataset( - path=args.dataset, - name=args.data_subset, - split=split_name, - data_files=data_file, - streaming=True, - ) - # Determine the output columns - columns, example_type = get_columns_and_format( - dataset=dataset, - tokenizing=tokenizer is not None, - preprocessing_fn=preprocessing_fn, - ) - # Prepare the iterables - if example_type == 'chat': - samples = iter(dataset) - else: - loader = build_dataloader( - dataset=dataset, - batch_size=512, - num_workers=args.num_workers, - ) - samples = generate_samples(loader) - - # Write samples - print(f'Converting {split_name} to MDS format...') - out = os.path.join(args.out_root, split_name) - if args.local is not None: - out = (os.path.join(args.local, split_name), out) - keep_local = True - else: - keep_local = False - with MDSWriter( - columns=columns, - out=out, - compression=args.compression, - keep_local=keep_local, - ) as out: - examples_removed = 0 - for sample in tqdm(samples, desc=split_name): - formatted_sample = preprocessing_fn(sample) - assert isinstance(formatted_sample, dict) - - # Use the _get_example_type utility to confirm that the formatted sample - # can be interpreted by the tokenization code - try: - example_type = _get_example_type(formatted_sample) - except Exception as e: - raise ValueError( - 'Encountered an error when checking example for proper formatting. 
' +\ - f'example={formatted_sample}', - ) from e - if tokenizer is not None: - sample = tokenize_formatted_example( - formatted_sample, - tokenizer=tokenizer, - ) - if not is_valid_ift_example( - args.max_seq_len, - target_prompts=args.target_prompts, - target_responses=args.target_responses, - decoder_only_format=not args.encoder_decoder, - example=sample, - ): - examples_removed += 1 - continue - - sample_to_write = {'turns': []} - for turn in sample['turns']: - turn_to_write = {} - for key in ['input_ids', 'labels']: - turn_to_write[key] = list(turn[key]) - sample_to_write['turns'].append(turn_to_write) - out.write(sample_to_write) - else: - if example_type == 'prompt_response': - encoded_sample = {} - for key in ['prompt', 'response']: - value = formatted_sample[key] - assert isinstance(value, str) - encoded_sample[key] = value.encode('utf-8') - out.write(encoded_sample) - else: - out.write(formatted_sample) - - if tokenizer is not None and examples_removed > 0: - warnings.warn( - f'Dropped {examples_removed} examples where the prompt was longer than {args.max_seq_len}, ' - + - 'the prompt or response was empty, or the response was all padding tokens.', - ) - - if __name__ == '__main__': """Example for converting Muennighoff/P3: @@ -355,4 +112,22 @@ def main(args: Namespace) -> None: >>> --preprocessor llmfoundry.data.finetuning.tasks:p3_preprocessing_function \ >>> --out_root s3:///muennighoff-p3 """ - main(parse_args()) + args = parse_args() + convert_finetuning_dataset_from_args( + dataset=args.dataset, + data_subset=args.data_subset, + splits=args.splits, + preprocessor=args.preprocessor, + data_files=args.data_files, + skip_preprocessing=args.skip_preprocessing, + out_root=args.out_root, + local=args.local, + compression=args.compression, + num_workers=args.num_workers, + tokenizer=args.tokenizer, + tokenizer_kwargs=args.tokenizer_kwargs, + max_seq_len=args.max_seq_len, + target_prompts=args.target_prompts, + target_responses=args.target_responses, + encoder_decoder=args.encoder_decoder, + ) diff --git a/tests/data/test_dataloader.py b/tests/data/test_dataloader.py index 21d73c0d34..8e92658194 100644 --- a/tests/data/test_dataloader.py +++ b/tests/data/test_dataloader.py @@ -22,6 +22,8 @@ from streaming.base.util import clean_stale_shared_memory from llmfoundry.command_utils import convert_dataset_hf +from llmfoundry.command_utils.data_prep.convert_finetuning_dataset import \ + get_columns_and_format from llmfoundry.data import build_dataloader, build_finetuning_dataloader from llmfoundry.data.finetuning.collator import ( _HF_IGNORE_INDEX, @@ -55,8 +57,6 @@ NotEnoughDatasetSamplesError, UnknownExampleTypeError, ) -# yapf: enable -from scripts.data_prep.convert_finetuning_dataset import get_columns_and_format from tests.data_utils import ( make_tiny_conversation_ft_dataset, make_tiny_ft_dataset, From feb786cd274bd048e96e6d13f46e46fd6d005d7d Mon Sep 17 00:00:00 2001 From: Saaketh Narayan Date: Fri, 19 Jul 2024 22:13:26 -0700 Subject: [PATCH 15/57] Allow for transforms on the model before MLFlow registration (#1372) * pre register transform * meta * yo * yo * yo * yo * log * log * yo * ay * yo * nice * test * test * Update llmfoundry/callbacks/hf_checkpointer.py Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> * Update llmfoundry/callbacks/hf_checkpointer.py Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --------- Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- llmfoundry/callbacks/hf_checkpointer.py | 173 
++++++++++-------- .../inference/test_convert_composer_to_hf.py | 5 + 2 files changed, 98 insertions(+), 80 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 4de7f9f2c6..2ade458bb4 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -18,7 +18,6 @@ import torch import torch.nn as nn from composer.core import Callback, Event, Precision, State, Time, TimeUnit -from composer.core.state import fsdp_state_dict_type_context from composer.loggers import Logger, MLFlowLogger from composer.models import HuggingFaceModel from composer.utils import ( @@ -29,7 +28,12 @@ ) from composer.utils.misc import create_interval_scheduler from mlflow.transformers import _fetch_model_card, _write_license_information -from packaging import version +from torch.distributed._tensor import DTensor +from torch.distributed.checkpoint.state_dict import ( + StateDictOptions, + get_model_state_dict, +) +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP from transformers import ( PretrainedConfig, PreTrainedModel, @@ -179,6 +183,7 @@ def __init__( 'bfloat16': torch.bfloat16, }[precision] self.flatten_imports = flatten_imports + self.using_peft = False # mlflow config setup self.mlflow_registered_model_name = mlflow_registered_model_name @@ -274,6 +279,15 @@ def run_event(self, event: Event, state: State, logger: Logger) -> None: mlflow.environment_variables.MLFLOW_HUGGINGFACE_MODEL_MAX_SHARD_SIZE.set( '1GB', ) + + # Check if the model is using PEFT + if state.is_model_ddp: + composer_model = state.model.module + elif isinstance(state.model.model, FSDP): + composer_model = state.model + else: + composer_model = state.model + self.using_peft = composer_model.using_peft elif event == Event.FIT_END: # Wait for all child processes spawned by the callback to finish. timeout = 3600 @@ -362,6 +376,23 @@ def transform_config( copied_config.ffn_config['moe_world_size'] = 1 return copied_config + def transform_model_pre_registration( + self, + model: PreTrainedModel, + ) -> PreTrainedModel: + """Transform the model before registering with MLflow. + + This allows a subclass to modify the model before registering with MLflow. The base class implementation will + make no modifications. + + Args: + model (PreTrainedModel): The model to be transformed. + + Returns: + PreTrainedModel: The transformed model. 
+ """ + return model + def _save_checkpoint(self, state: State, logger: Logger): del logger # unused @@ -388,82 +419,62 @@ def _save_checkpoint(self, state: State, logger: Logger): temp_save_dir = tempfile.mkdtemp() if use_temp_dir else save_dir log.debug('Gathering state dict') - from torch.distributed.fsdp import FullyShardedDataParallel as FSDP if state.is_model_ddp: - composer_model = state.model.module original_model: PreTrainedModel = state.model.module.model state_dict_model = state.model.module.model original_tokenizer = state.model.module.tokenizer elif isinstance(state.model.model, FSDP): - composer_model = state.model original_model: PreTrainedModel = state.model.model.module state_dict_model = state.model.model original_tokenizer = state.model.tokenizer else: - composer_model = state.model original_model: PreTrainedModel = state.model.model state_dict_model = state.model.model original_tokenizer = state.model.tokenizer - if version.parse(torch.__version__) > version.parse('2.2.9'): - from torch.distributed._tensor import DTensor - from torch.distributed.checkpoint.state_dict import ( - StateDictOptions, - get_model_state_dict, - ) - cpu_offload = True - - # Add a dtensor->cpu tensor hook to avoid CUDA OOM - def dtensor_to_tensor_hook( - module: nn.Module, - state_dict: Dict[str, Any], - prefix: str, - *args: Any, - ) -> Dict[str, Any]: - dtensor_fqns = [] - for fqn in state_dict.keys(): - tensor = state_dict[fqn] - if isinstance(tensor, DTensor): - dtensor_fqns.append(fqn) - tensor = tensor.full_tensor() # type: ignore - if dist.get_global_rank() == 0: - if cpu_offload: - tensor = tensor.cpu() - state_dict[fqn] = tensor - if dist.get_global_rank() != 0: - for fqn in dtensor_fqns: - del state_dict[fqn] - return state_dict - - hooks = [] - for _, module in state_dict_model.named_modules(): - if isinstance(module, FSDP): - hooks.append( - module. 
- _register_state_dict_hook(dtensor_to_tensor_hook), - ) + cpu_offload = True + + # Add a dtensor->cpu tensor hook to avoid CUDA OOM + def dtensor_to_tensor_hook( + module: nn.Module, + state_dict: Dict[str, Any], + prefix: str, + *args: Any, + ) -> Dict[str, Any]: + dtensor_fqns = [] + for fqn in state_dict.keys(): + tensor = state_dict[fqn] + if isinstance(tensor, DTensor): + dtensor_fqns.append(fqn) + tensor = tensor.full_tensor() # type: ignore + if dist.get_global_rank() == 0: + if cpu_offload: + tensor = tensor.cpu() + state_dict[fqn] = tensor + if dist.get_global_rank() != 0: + for fqn in dtensor_fqns: + del state_dict[fqn] + return state_dict + + hooks = [] + for _, module in state_dict_model.named_modules(): + if isinstance(module, FSDP): + hooks.append( + module._register_state_dict_hook(dtensor_to_tensor_hook), + ) - state_dict = get_model_state_dict( - state_dict_model, - options=StateDictOptions( - full_state_dict=True, - cpu_offload=cpu_offload, - ), - ) - for hook in hooks: - hook.remove() - else: - state_dict_context = fsdp_state_dict_type_context( - original_model, - state_dict_type='full', - ) if ((not state.is_model_ddp) and - isinstance(state_dict_model, - FSDP)) else contextlib.nullcontext() - with state_dict_context: - state_dict = state_dict_model.state_dict() - - # Convert the state dict to the requested precis + state_dict = get_model_state_dict( + state_dict_model, + options=StateDictOptions( + full_state_dict=True, + cpu_offload=cpu_offload, + ), + ) + for hook in hooks: + hook.remove() + + # Convert the state dict to the requested precision for k, v in state_dict.items(): if isinstance(v, torch.Tensor): state_dict[k] = v.to(dtype=self.dtype) @@ -480,22 +491,19 @@ def dtensor_to_tensor_hook( log.debug(f'Creating new model instance') - if composer_model.using_peft: - # We don't use meta here because the state dict does not contain the full - # model, only the adapter weights. - active_adapter = original_model.active_adapter - base_model = original_model.get_base_model() - new_base_model_instance = type(base_model)(new_config) - - new_model_instance = type(original_model)( - new_base_model_instance, - original_model.peft_config[active_adapter], - ) - new_model_instance.to(dtype=self.dtype) - else: - # First create the model instance on meta device to avoid the - # initialization cost. - with init_empty_weights(): + # First create the model instance on meta device to avoid the + # initialization cost. 
+ with init_empty_weights(): + if self.using_peft: + active_adapter = original_model.active_adapter + base_model = original_model.get_base_model() + new_base_model_instance = type(base_model)(new_config) + + new_model_instance = type(original_model)( + new_base_model_instance, + original_model.peft_config[active_adapter], + ) + else: new_model_instance = type(original_model)(new_config) new_model_instance.generation_config.update( **original_model.generation_config.to_dict(), @@ -556,6 +564,11 @@ def dtensor_to_tensor_hook( if dist.get_global_rank() == 0: if self.mlflow_registered_model_name and self._is_last_batch(state): + + new_model_instance = self.transform_model_pre_registration( + new_model_instance, + ) + components = {'model': new_model_instance} if original_tokenizer is not None: components['tokenizer'] = original_tokenizer @@ -575,7 +588,7 @@ def dtensor_to_tensor_hook( model_saving_kwargs: Dict[str, Any] = { 'path': local_save_path, } - if composer_model.using_peft: + if self.using_peft: model_saving_kwargs['flavor'] = 'peft' model_saving_kwargs['save_pretrained_dir' ] = temp_save_dir diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py index 2ef458fece..68dc855154 100644 --- a/tests/a_scripts/inference/test_convert_composer_to_hf.py +++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py @@ -383,6 +383,9 @@ def test_huggingface_conversion_callback_interval( mlflow_logger_mock.model_registry_prefix = '' mlflow_logger_mock._experiment_id = 'mlflow-experiment-id' mlflow_logger_mock._run_id = 'mlflow-run-id' + checkpointer_callback.transform_model_pre_registration = MagicMock( + wraps=checkpointer_callback.transform_model_pre_registration, + ) trainer = Trainer( model=original_model, device='gpu', @@ -407,8 +410,10 @@ def test_huggingface_conversion_callback_interval( input_example=ANY, metadata={}, ) + assert checkpointer_callback.transform_model_pre_registration.call_count == 1 assert mlflow_logger_mock.register_model_with_run_id.call_count == 1 else: + assert checkpointer_callback.transform_model_pre_registration.call_count == 0 assert mlflow_logger_mock.save_model.call_count == 0 assert mlflow_logger_mock.register_model_with_run_id.call_count == 0 From d50b0dbfb157c34afcd71737a3e50888b46fba2a Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Sat, 20 Jul 2024 16:02:47 -0700 Subject: [PATCH 16/57] Allow flash attention up to 3 (#1377) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 309d7d3372..eb22d8f6a3 100644 --- a/setup.py +++ b/setup.py @@ -104,7 +104,7 @@ # Flash 2 group kept for backwards compatibility extra_deps['gpu-flash2'] = [ - 'flash-attn==2.5.8', + 'flash-attn>=2.5.8,<3', ] extra_deps['gpu'] = copy.deepcopy(extra_deps['gpu-flash2']) From 485719593b657276a06b83c31b390e2ba3be4cc0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 21 Jul 2024 01:33:17 -0400 Subject: [PATCH 17/57] Update accelerate requirement from <0.26,>=0.25 to >=0.32.1,<0.33 (#1341) Updates the requirements on [accelerate](https://github.com/huggingface/accelerate) to permit the latest version. - [Release notes](https://github.com/huggingface/accelerate/releases) - [Commits](https://github.com/huggingface/accelerate/compare/v0.25.0...v0.32.1) --- updated-dependencies: - dependency-name: accelerate dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index eb22d8f6a3..b4948e9a43 100644 --- a/setup.py +++ b/setup.py @@ -56,7 +56,7 @@ install_requires = [ 'mosaicml[libcloud,wandb,oci,gcs,mlflow]>=0.23.4,<0.24', 'mlflow>=2.14.1,<2.15', - 'accelerate>=0.25,<0.26', # for HF inference `device_map` + 'accelerate>=0.25,<0.33', # for HF inference `device_map` 'transformers>=4.42.3,<4.43', 'mosaicml-streaming>=0.7.6,<0.8', 'torch>=2.3.0,<2.4', From a7b4056a17fb8ce3e484c888c55428b27e92816b Mon Sep 17 00:00:00 2001 From: Kevin DeShawn <126115026+KevDevSha@users.noreply.github.com> Date: Sun, 21 Jul 2024 17:38:18 -0500 Subject: [PATCH 18/57] update runners (#1360) --- .github/workflows/code-quality.yaml | 2 +- .github/workflows/coverage.yaml | 2 +- .github/workflows/release.yaml | 2 +- .github/workflows/smoketest.yaml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/code-quality.yaml b/.github/workflows/code-quality.yaml index ef85332047..062aa41bf4 100644 --- a/.github/workflows/code-quality.yaml +++ b/.github/workflows/code-quality.yaml @@ -19,7 +19,7 @@ defaults: working-directory: . jobs: code-quality: - runs-on: ubuntu-20.04 + runs-on: linux-ubuntu-latest timeout-minutes: 30 strategy: matrix: diff --git a/.github/workflows/coverage.yaml b/.github/workflows/coverage.yaml index fc511d7e60..cf3581f716 100644 --- a/.github/workflows/coverage.yaml +++ b/.github/workflows/coverage.yaml @@ -8,7 +8,7 @@ on: jobs: coverage: timeout-minutes: 5 - runs-on: ubuntu-latest + runs-on: linux-ubuntu-latest steps: - name: Checkout Repo uses: actions/checkout@v3 diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 144e3f1ad3..c09f9bb7a5 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -14,7 +14,7 @@ jobs: name: Build and Publish llm-foundry PyPI Package needs: - code-quality - runs-on: ubuntu-latest + runs-on: linux-ubuntu-latest steps: - name: Checkout source uses: actions/checkout@v3 diff --git a/.github/workflows/smoketest.yaml b/.github/workflows/smoketest.yaml index 2163111710..d38849cddc 100644 --- a/.github/workflows/smoketest.yaml +++ b/.github/workflows/smoketest.yaml @@ -18,7 +18,7 @@ defaults: working-directory: . 
jobs: smoketest: - runs-on: ubuntu-20.04 + runs-on: linux-ubuntu-latest timeout-minutes: 20 strategy: matrix: From d812f20c5472f771f06a9dd561a31d3a88ed26bf Mon Sep 17 00:00:00 2001 From: Brian <23239305+b-chu@users.noreply.github.com> Date: Mon, 22 Jul 2024 15:02:09 -0400 Subject: [PATCH 19/57] Allow for multiple workers when autopacking (#1375) --- llmfoundry/data/packing.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index a6fdf34953..0c5cb1418b 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -424,8 +424,6 @@ def profile_packing( dataloader_cfg = copy.deepcopy(dataloader_cfg) dataloader_cfg.update({ 'drop_last': False, - 'num_workers': 0, - 'prefetch_factor': None, 'persistent_workers': False, }) dataloader_cfg['dataset']['packing_ratio'] = 1.0 From eb41a6e798cbbd4907e42e4b45ce42bd86c88889 Mon Sep 17 00:00:00 2001 From: Jose Javier <26491792+josejg@users.noreply.github.com> Date: Mon, 22 Jul 2024 17:03:13 -0700 Subject: [PATCH 20/57] Allow train.py-like config for eval.py (#1351) * Allow model key in eval script * Compatibility * pre-commit fix * Fix load_path * fix * Refactor as a config transform * formatting * fix * fix pyright * fix --------- Co-authored-by: Mihir Patel --- llmfoundry/command_utils/eval.py | 49 ++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/llmfoundry/command_utils/eval.py b/llmfoundry/command_utils/eval.py index 7d8306c0a0..bddd592dba 100644 --- a/llmfoundry/command_utils/eval.py +++ b/llmfoundry/command_utils/eval.py @@ -175,6 +175,54 @@ def evaluate_model( return (trainer, logger_keys, eval_gauntlet_callback, eval_gauntlet_df) +def allow_toplevel_keys(cfg: Dict[str, Any]) -> Dict[str, Any]: + """Transform the config to allow top-level keys for model configuration. + + This function allows users to use the 'train.py' syntax in 'eval.py'. + It converts a config with top-level 'model', 'tokenizer', and (optionally) 'load_path' keys + into the nested 'models' list format required by 'eval.py'. 
+ + Input config format (train.py style): + ```yaml + model: + + load_path: /path/to/checkpoint + tokenizer: + + ``` + + Output config format (eval.py style): + ```yaml + models: + - model: + + tokenizer: + + load_path: /path/to/checkpoint + ``` + """ + if 'model' in cfg: + if 'models' in cfg: + raise ValueError( + 'Please specify either model or models in the config, not both', + ) + default_name = cfg.get('model').get('name') # type: ignore + model_cfg = { + 'model': cfg.pop('model'), + 'tokenizer': cfg.pop('tokenizer', None), + 'model_name': cfg.pop('model_name', default_name), + } + if 'tokenizer' not in model_cfg or model_cfg['tokenizer'] is None: + raise ValueError( + 'When specifying model, "tokenizer" must be provided in the config', + ) + if 'load_path' in cfg: + model_cfg['load_path'] = cfg.pop('load_path') + cfg['models'] = [model_cfg] + + return cfg + + def evaluate(cfg: DictConfig) -> Tuple[list[Trainer], pd.DataFrame]: # Run user provided code if specified for code_path in cfg.get('code_paths', []): @@ -184,6 +232,7 @@ def evaluate(cfg: DictConfig) -> Tuple[list[Trainer], pd.DataFrame]: cfg, EvalConfig, EVAL_CONFIG_KEYS, + transforms=[allow_toplevel_keys], icl_tasks_required=True, ) From 0bed4ffb3cbf73260e0925ab473a579c16921703 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Mon, 22 Jul 2024 17:52:22 -0700 Subject: [PATCH 21/57] fix load and save planner config logic (#1385) --- llmfoundry/command_utils/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llmfoundry/command_utils/train.py b/llmfoundry/command_utils/train.py index feed1e9fb1..77bb9dbcfe 100644 --- a/llmfoundry/command_utils/train.py +++ b/llmfoundry/command_utils/train.py @@ -260,7 +260,7 @@ def train(cfg: DictConfig) -> Trainer: if fsdp_config is not None: if 'load_planner' in fsdp_config: - load_planners = fsdp_config['load_planner'].items() + load_planners = list(fsdp_config['load_planner'].items()) if len(load_planners) > 1: raise ValueError( 'Only one load planner can be specified in the config.', @@ -272,7 +272,7 @@ def train(cfg: DictConfig) -> Trainer: ) if 'save_planner' in fsdp_config: - save_planners = fsdp_config['save_planner'].items() + save_planners = list(fsdp_config['save_planner'].items()) if len(save_planners) > 1: raise ValueError( 'Only one save planner can be specified in the config.', From 596dd9dcef1df60bee39c62868b48b2d82d7cb28 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Mon, 22 Jul 2024 18:39:35 -0700 Subject: [PATCH 22/57] Do dtype conversion in torch hook to save memory (#1384) * Do dtype conversion in torch hook to save memory * update code comment Co-authored-by: Saaketh Narayan --------- Co-authored-by: Saaketh Narayan --- llmfoundry/callbacks/hf_checkpointer.py | 26 +++++++++---------- .../inference/test_convert_composer_to_hf.py | 2 ++ 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 2ade458bb4..7127d37f40 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -435,8 +435,8 @@ def _save_checkpoint(self, state: State, logger: Logger): cpu_offload = True - # Add a dtensor->cpu tensor hook to avoid CUDA OOM - def dtensor_to_tensor_hook( + # Add hook to move tensors to cpu to avoid CUDA OOM + def tensor_hook( module: nn.Module, state_dict: Dict[str, Any], prefix: str, @@ -449,20 +449,23 @@ def dtensor_to_tensor_hook( dtensor_fqns.append(fqn) tensor = tensor.full_tensor() # type: ignore if 
dist.get_global_rank() == 0: + # Offload any DTensors to CPU if cpu_offload: tensor = tensor.cpu() state_dict[fqn] = tensor + else: + state_dict[fqn] = None + # Convert the state dict to the requested precision + if isinstance(tensor, torch.Tensor): + state_dict[fqn] = tensor.to(dtype=self.dtype) + del tensor if dist.get_global_rank() != 0: - for fqn in dtensor_fqns: - del state_dict[fqn] + state_dict = {} return state_dict hooks = [] for _, module in state_dict_model.named_modules(): - if isinstance(module, FSDP): - hooks.append( - module._register_state_dict_hook(dtensor_to_tensor_hook), - ) + hooks.append(module._register_state_dict_hook(tensor_hook),) state_dict = get_model_state_dict( state_dict_model, @@ -474,11 +477,6 @@ def dtensor_to_tensor_hook( for hook in hooks: hook.remove() - # Convert the state dict to the requested precision - for k, v in state_dict.items(): - if isinstance(v, torch.Tensor): - state_dict[k] = v.to(dtype=self.dtype) - new_model_instance = None # Need this for pyright because variable could be unbound if dist.get_global_rank() == 0: @@ -537,7 +535,7 @@ def dtensor_to_tensor_hook( original_tokenizer.save_pretrained(temp_save_dir) # Only need to edit files for MPT because it has custom code - if original_model.config.model_type == 'mpt': + if new_model_instance.config.model_type == 'mpt': log.debug('Editing MPT files for HuggingFace compatibility') edit_files_for_hf_compatibility( temp_save_dir, diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py index 68dc855154..ffdb09ca98 100644 --- a/tests/a_scripts/inference/test_convert_composer_to_hf.py +++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py @@ -383,6 +383,8 @@ def test_huggingface_conversion_callback_interval( mlflow_logger_mock.model_registry_prefix = '' mlflow_logger_mock._experiment_id = 'mlflow-experiment-id' mlflow_logger_mock._run_id = 'mlflow-run-id' + mlflow_logger_mock._enabled = True + mlflow_logger_mock.run_url = 'fake-url' checkpointer_callback.transform_model_pre_registration = MagicMock( wraps=checkpointer_callback.transform_model_pre_registration, ) From d2d29adad17ae1fc48a294dfd5c4fa4d3e63e809 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Tue, 23 Jul 2024 04:00:18 -0700 Subject: [PATCH 23/57] Use utils to get shared fs safe signal file name (#1381) --- llmfoundry/data/finetuning/dataloader.py | 55 +++++++++--------------- llmfoundry/data/finetuning/tasks.py | 2 +- llmfoundry/models/hf/hf_causal_lm.py | 2 +- llmfoundry/utils/builders.py | 2 +- 4 files changed, 23 insertions(+), 38 deletions(-) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 11104ac706..60052acdc5 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -534,42 +534,27 @@ def _download_remote_hf_dataset(remote_path: str, split: str) -> str: # Since we don't know exactly what the extension will be, since it is one of a list # use a signal file to wait for instead of the desired file - signal_file_path = os.path.join( - finetune_dir, - f'.node_{dist.get_node_rank()}_local_rank0_completed', - ) - if dist.get_local_rank() == 0: - try: - get_file(path=name, destination=destination, overwrite=True) - except FileNotFoundError as e: - if extension == SUPPORTED_EXTENSIONS[-1]: - files_searched = [ - f'{name}/{split}{ext}' for ext in SUPPORTED_EXTENSIONS - ] - raise FileNotFoundError( - f'Could not find 
a file with any of ' + \ - f'the supported extensions: {SUPPORTED_EXTENSIONS}\n' + \ - f'at {files_searched}', - ) from e - else: - log.debug( - f'Could not find {name}, looking for another extension', - ) - continue - - os.makedirs(os.path.dirname(signal_file_path), exist_ok=True) - with open(signal_file_path, 'wb') as f: - f.write(b'local_rank0_completed_download') - - # Avoid the collective call until the local rank zero has finished trying to download the dataset - # so that we don't timeout for large downloads. This syncs all processes on the node - with dist.local_rank_zero_download_and_wait(signal_file_path): - # Then, wait to ensure every node has finished trying to download the dataset - dist.barrier() + with dist.busy_wait_for_local_rank_zero(finetune_dir): + if dist.get_local_rank() == 0: + try: + get_file(path=name, destination=destination, overwrite=True) + except FileNotFoundError as e: + if extension == SUPPORTED_EXTENSIONS[-1]: + files_searched = [ + f'{name}/{split}{ext}' + for ext in SUPPORTED_EXTENSIONS + ] + raise FileNotFoundError( + f'Could not find a file with any of ' + \ + f'the supported extensions: {SUPPORTED_EXTENSIONS}\n' + \ + f'at {files_searched}', + ) from e + else: + log.debug( + f'Could not find {name}, looking for another extension', + ) + continue - # clean up signal file - if dist.get_local_rank() == 0: - os.remove(signal_file_path) dist.barrier() break return finetune_dir diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index 78bfb9c74c..d5af632952 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -827,7 +827,7 @@ def build_from_hf( Returns: Dataset: The tokenized dataset. """ - signal_file_path = f'.node_{dist.get_node_rank()}_local_rank0_data_prep_completed' + signal_file_path = dist.get_node_signal_file_name() # Non local rank 0 ranks will wait here for local rank 0 to finish the data processing. 
# Once local rank 0 is done, the datasets are all cached on disk, and all other ranks diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py index e3fa8d03a3..a15429aa06 100644 --- a/llmfoundry/models/hf/hf_causal_lm.py +++ b/llmfoundry/models/hf/hf_causal_lm.py @@ -363,7 +363,7 @@ def _autoset_attn_implementation_monkeypatch( f'init_device="{init_device}" must be either "cpu" or "meta".', ) - signal_file_path = f'.node_{dist.get_node_rank()}_local_rank0_completed' + signal_file_path = dist.get_node_signal_file_name() if dist.get_local_rank() == 0: with open(signal_file_path, 'wb') as f: f.write(b'local_rank0_completed_download') diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 0437736f74..b889155be0 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -498,7 +498,7 @@ def build_tokenizer( os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = '1' os.environ['TOKENIZERS_PARALLELISM'] = 'false' - signal_file_path = f'.node_{dist.get_node_rank()}_local_rank0_completed_tokenizer_setup' + signal_file_path = dist.get_node_signal_file_name() if dist.is_available() and dist.is_initialized( ) and dist.get_world_size() > 1: From cefd616048d52ed714cd5e982a54eaaf9aa38707 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Tue, 23 Jul 2024 10:42:00 -0700 Subject: [PATCH 24/57] Add transformation hooks to hf_causal_lm (#1383) --- llmfoundry/models/hf/hf_causal_lm.py | 72 ++++++++------------------ llmfoundry/utils/config_utils.py | 42 +++++++++++++++ tests/models/hf/test_hf_transform.py | 76 ++++++++++++++++++++++++++++ 3 files changed, 139 insertions(+), 51 deletions(-) create mode 100644 tests/models/hf/test_hf_transform.py diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py index a15429aa06..7c0baf0c58 100644 --- a/llmfoundry/models/hf/hf_causal_lm.py +++ b/llmfoundry/models/hf/hf_causal_lm.py @@ -11,7 +11,6 @@ Any, Dict, List, - Mapping, Optional, Tuple, Union, @@ -23,7 +22,6 @@ from transformers import ( AutoConfig, AutoModelForCausalLM, - PretrainedConfig, PreTrainedModel, PreTrainedTokenizerBase, ) @@ -36,7 +34,7 @@ from llmfoundry.models.hf.model_wrapper import HuggingFaceModelWithFSDP from llmfoundry.models.layers.attention import is_flash_v2_installed from llmfoundry.models.utils import init_empty_weights -from llmfoundry.utils.config_utils import get_hf_config_value +from llmfoundry.utils.config_utils import set_config_overrides if TYPE_CHECKING: from peft import PeftConfig, PeftModel @@ -105,9 +103,13 @@ def __init__( config_overrides=config_overrides, load_in_8bit=load_in_8bit, pretrained=pretrained, - prepare_for_fsdp=True, + prepare_for_fsdp=False, ) + model = self.transform_model(model) + + ComposerHFCausalLM.prepare_inner_model(model, init_device) + train_metrics, eval_metrics = ComposerHFCausalLM.build_metrics( use_train_metrics=use_train_metrics, additional_train_metrics=additional_train_metrics, @@ -121,7 +123,7 @@ def __init__( peft_config_object = None if peft_config is not None: - peft_config_object = self._get_peft_config(peft_config) + peft_config_object = self.get_peft_config(peft_config) # Set up config args for the model construction and base classes super().__init__( @@ -135,6 +137,17 @@ def __init__( should_save_peft_only=should_save_peft_only, ) + def transform_model(self, model: PreTrainedModel) -> PreTrainedModel: + """Transforms the model after initialization. + + Args: + model (PreTrainedModel): The model to transform. 
+ + Returns: + PreTrainedModel: The transformed model. + """ + return model + @staticmethod def build_metrics( use_train_metrics: bool, @@ -259,50 +272,7 @@ def _autoset_attn_implementation_monkeypatch( _autoset_attn_implementation_monkeypatch, ) - # set config overrides - for k, v in config_overrides.items(): - if not hasattr(config, k): - raise ValueError( - f'config does not have attribute "{k}" to override ({k}: {v}).', - ) - - attr = getattr(config, k) - # attempt to disallow typos in nested configs - if isinstance(attr, Mapping): - extra_keys = [_k for _k in v.keys() if _k not in attr.keys()] - if extra_keys: - raise ValueError( - f'Config dict override got unknown keys. ' + - f'Extra keys: {extra_keys}. ' + - f'Expected (a subset of) keys: {list(attr.keys())}.', - ) - getattr(config, k).update(v) - # necessary case to allow for rope_scaling to be overriden in llama config - elif attr is None and isinstance(v, Mapping): - setattr(config, k, {}) - getattr(config, k).update(v) - elif isinstance(attr, PretrainedConfig): - if not isinstance(v, Mapping): - raise ValueError( - f'Expected a dictionary for config override {k}, but got {v}.', - ) - - for _k, _v in v.items(): - if not hasattr(attr, _k): - raise ValueError( - f'config does not have attribute "{_k}" to override ({k}: {_k}: {_v}).', - ) - setattr(attr, _k, _v) - else: - setattr(config, k, v) - - if hasattr(config, 'attn_config') and get_hf_config_value( - config.attn_config, - 'seq_parallel_world_size', - ) is not None: - raise NotImplementedError( - 'Sequence Parallelism is not supported for HuggingFace models.', - ) + set_config_overrides(config, config_overrides) # We need to have all non-zero local ranks be not-pretrained # Rank 0 will still be pretrained, and distribute the weights appropriately @@ -395,10 +365,10 @@ def _autoset_attn_implementation_monkeypatch( if prepare_for_fsdp: ComposerHFCausalLM.prepare_inner_model(model, init_device) + return model - @staticmethod - def _get_peft_config(peft_config_dict: Dict[str, Any]) -> 'PeftConfig': + def get_peft_config(self, peft_config_dict: Dict[str, Any]) -> 'PeftConfig': if peft_installed: from peft import LoraConfig peft_type = peft_config_dict.get('peft_type', '') diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 4b86de99b8..48290bd7c5 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -812,3 +812,45 @@ def _verify_uc_path(path: str) -> bool: f'but your `UCVolumeDatasetSource` might be invalid.', ) return False + + +def set_config_overrides( + config: PretrainedConfig, + config_overrides: Dict[str, Any], +): + # set config overrides + for k, v in config_overrides.items(): + if not hasattr(config, k): + raise ValueError( + f'config does not have attribute "{k}" to override ({k}: {v}).', + ) + + attr = getattr(config, k) + # attempt to disallow typos in nested configs + if isinstance(attr, Mapping): + extra_keys = [_k for _k in v.keys() if _k not in attr.keys()] + if extra_keys: + raise ValueError( + f'Config dict override got unknown keys. ' + + f'Extra keys: {extra_keys}. 
' + + f'Expected (a subset of) keys: {list(attr.keys())}.', + ) + getattr(config, k).update(v) + # necessary case to allow for rope_scaling to be overriden in llama config + elif attr is None and isinstance(v, Mapping): + setattr(config, k, {}) + getattr(config, k).update(v) + elif isinstance(attr, PretrainedConfig): + if not isinstance(v, Mapping): + raise ValueError( + f'Expected a dictionary for config override {k}, but got {v}.', + ) + + for _k, _v in v.items(): + if not hasattr(attr, _k): + raise ValueError( + f'config does not have attribute "{_k}" to override ({k}: {_k}: {_v}).', + ) + setattr(attr, _k, _v) + else: + setattr(config, k, v) diff --git a/tests/models/hf/test_hf_transform.py b/tests/models/hf/test_hf_transform.py new file mode 100644 index 0000000000..f479b50f73 --- /dev/null +++ b/tests/models/hf/test_hf_transform.py @@ -0,0 +1,76 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +from typing import Any, Dict, Optional + +import pytest +from composer.models.huggingface import maybe_get_underlying_model +from peft import PeftConfig, PeftModel +from transformers import LlamaForCausalLM, PreTrainedModel + +from llmfoundry.models.hf.hf_causal_lm import ComposerHFCausalLM +from llmfoundry.models.utils import init_empty_weights + + +@pytest.mark.gpu +@pytest.mark.parametrize( + 'peft_config', + [ + None, + { + 'peft_type': 'LORA', + 'task_type': 'CAUSAL_LM', + 'lora_alpha': 32, + 'r': 2, + 'target_modules': [ + 'q_proj', + 'k_proj', + 'v_proj', + ], + }, + ], +) +def test_hf_transform(peft_config: Optional[dict]): + model_cfg = { + 'pretrained_model_name_or_path': 'codellama/CodeLlama-7b-hf', + 'config_overrides': { + 'num_hidden_layers': 2, + 'hidden_size': 32, + 'intermediate_size': 64, + }, + 'pretrained': False, + 'peft_config': peft_config, + 'init_device': 'meta', + 'tokenizer': 'codellama/CodeLlama-7b-hf', + } + + class TransformedHFCausalLM(ComposerHFCausalLM): + + def transform_model(self, model: PreTrainedModel) -> PreTrainedModel: + assert isinstance(model, LlamaForCausalLM) + with init_empty_weights(): + model.config.num_hidden_layers = 1 + new_model = type(model)(model.config) + return new_model + + def get_peft_config( + self, + peft_config_dict: Dict[str, Any], + ) -> PeftConfig: + peft_config_dict['target_modules'] = ['o_proj'] + return super().get_peft_config(peft_config_dict) + + composer_model = TransformedHFCausalLM(**model_cfg) + model = composer_model.model + inner_model = maybe_get_underlying_model(model) + + if peft_config: + peft_model = composer_model.model + assert isinstance(peft_model, PeftModel) + + target_modules = peft_model.peft_config[peft_model.active_adapter + ].target_modules + assert list(target_modules) == ['o_proj'] + + assert isinstance(inner_model, LlamaForCausalLM) + assert inner_model.config.num_hidden_layers == 1 From 51949c4c0d7a1bab0e2112921e11a51e947b544e Mon Sep 17 00:00:00 2001 From: Kushal Kodnad <170473237+kushalkodn-db@users.noreply.github.com> Date: Tue, 23 Jul 2024 11:15:40 -0700 Subject: [PATCH 25/57] [kushalkodnad/tokenizer-registry] Introduce new registry for tokenizers (#1386) --- llmfoundry/registry.py | 14 +++++++++++++ llmfoundry/tokenizers/__init__.py | 3 +++ llmfoundry/utils/builders.py | 12 ++++++++--- tests/test_registry.py | 1 + tests/tokenizers/test_registry.py | 35 +++++++++++++++++++++++++++++++ 5 files changed, 62 insertions(+), 3 deletions(-) create mode 100644 tests/tokenizers/test_registry.py diff --git a/llmfoundry/registry.py b/llmfoundry/registry.py index 
e31840d3fb..3f0163ff01 100644 --- a/llmfoundry/registry.py +++ b/llmfoundry/registry.py @@ -155,6 +155,19 @@ description=_schedulers_description, ) +_tokenizers_description = ( + 'The tokenizers registry is used to register tokenizers that implement the transformers.PreTrainedTokenizerBase interface. ' + + + 'The tokenizer will be passed to the build_dataloader() and build_composer_model() methods in train.py.' +) +tokenizers = create_registry( + 'llmfoundry', + 'tokenizers', + generic_type=Type[PreTrainedTokenizerBase], + entry_points=True, + description=_tokenizers_description, +) + _models_description = ( """The models registry is used to register classes that implement the ComposerModel interface. @@ -383,6 +396,7 @@ 'optimizers', 'algorithms', 'schedulers', + 'tokenizers', 'models', 'dataset_replication_validators', 'collators', diff --git a/llmfoundry/tokenizers/__init__.py b/llmfoundry/tokenizers/__init__.py index 1703ed8862..d37c12a555 100644 --- a/llmfoundry/tokenizers/__init__.py +++ b/llmfoundry/tokenizers/__init__.py @@ -1,8 +1,11 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 +from llmfoundry.registry import tokenizers from llmfoundry.tokenizers.tiktoken import TiktokenTokenizerWrapper +tokenizers.register('tiktoken', func=TiktokenTokenizerWrapper) + __all__ = [ 'TiktokenTokenizerWrapper', ] diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index b889155be0..cf27e7660e 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -37,7 +37,6 @@ from llmfoundry.data.dataloader import build_dataloader from llmfoundry.eval.datasets.in_context_learning_evaluation import \ get_icl_task_dataloader -from llmfoundry.tokenizers.tiktoken import TiktokenTokenizerWrapper from llmfoundry.utils.config_utils import to_dict_container, to_list_container from llmfoundry.utils.registry_utils import construct_from_registry @@ -506,8 +505,15 @@ def build_tokenizer( with dist.local_rank_zero_download_and_wait(signal_file_path): pass - if tokenizer_name.startswith('tiktoken'): - tokenizer = TiktokenTokenizerWrapper(**tokenizer_kwargs) + if tokenizer_name in registry.tokenizers: + tokenizer = construct_from_registry( + name=tokenizer_name, + registry=registry.tokenizers, + partial_function=True, + pre_validation_function=PreTrainedTokenizerBase, + post_validation_function=None, + kwargs=tokenizer_kwargs, + ) else: tokenizer = AutoTokenizer.from_pretrained( tokenizer_name, diff --git a/tests/test_registry.py b/tests/test_registry.py index aa0c93ee13..c4d1a1bcd5 100644 --- a/tests/test_registry.py +++ b/tests/test_registry.py @@ -24,6 +24,7 @@ def test_expected_registries_exist(): 'loggers', 'optimizers', 'schedulers', + 'tokenizers', 'callbacks', 'algorithms', 'callbacks_with_config', diff --git a/tests/tokenizers/test_registry.py b/tests/tokenizers/test_registry.py new file mode 100644 index 0000000000..920c207a64 --- /dev/null +++ b/tests/tokenizers/test_registry.py @@ -0,0 +1,35 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +from typing import Any, Dict, Optional + +from transformers import PreTrainedTokenizer + +from llmfoundry.registry import tokenizers +from llmfoundry.utils import build_tokenizer + + +class DummyTokenizer(PreTrainedTokenizer): + """A dummy tokenizer that inherits from ``PreTrainedTokenizer``.""" + + def __init__( + self, + model_name: Optional[str] = 'dummy', + **kwargs: Optional[Dict[str, Any]], + ): + """Dummy constructor that has no real purpose.""" + 
super().__init__( + model_name=model_name, + eos_token='0', + pad_token='1', + **kwargs, + ) + + def get_vocab(self) -> Dict[str, int]: + return {} + + +def test_tokenizer_registry(): + tokenizers.register('dummy', func=DummyTokenizer) + tokenizer = build_tokenizer(tokenizer_name='dummy', tokenizer_kwargs={}) + assert type(tokenizer) == DummyTokenizer From d49e6a27f4591c0572f3509c508092ed4af6b571 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Tue, 23 Jul 2024 11:36:57 -0700 Subject: [PATCH 26/57] bump transformers (#1388) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b4948e9a43..4563b4dfb8 100644 --- a/setup.py +++ b/setup.py @@ -57,7 +57,7 @@ 'mosaicml[libcloud,wandb,oci,gcs,mlflow]>=0.23.4,<0.24', 'mlflow>=2.14.1,<2.15', 'accelerate>=0.25,<0.33', # for HF inference `device_map` - 'transformers>=4.42.3,<4.43', + 'transformers>=4.43.1,<4.44', 'mosaicml-streaming>=0.7.6,<0.8', 'torch>=2.3.0,<2.4', 'datasets>=2.19,<2.20', From 3d7d12e676b64a46eb1cc335960779e7edc5728a Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 23 Jul 2024 11:56:59 -0700 Subject: [PATCH 27/57] Add convert_delta_to_json to CLI (#1355) --- llmfoundry/cli/data_prep_cli.py | 26 + llmfoundry/command_utils/__init__.py | 6 + .../data_prep/convert_delta_to_json.py | 758 ++++++++++++++++++ scripts/data_prep/convert_delta_to_json.py | 662 +-------------- .../data_prep/test_convert_delta_to_json.py | 348 +++++--- 5 files changed, 1024 insertions(+), 776 deletions(-) create mode 100644 llmfoundry/command_utils/data_prep/convert_delta_to_json.py diff --git a/llmfoundry/cli/data_prep_cli.py b/llmfoundry/cli/data_prep_cli.py index b2a4af4521..130e0a6585 100644 --- a/llmfoundry/cli/data_prep_cli.py +++ b/llmfoundry/cli/data_prep_cli.py @@ -1,6 +1,7 @@ # Copyright 2024 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 +import os from typing import Annotated, Optional import psutil @@ -9,6 +10,7 @@ from llmfoundry.command_utils import ( convert_dataset_hf_from_args, convert_dataset_json_from_args, + convert_delta_to_json_from_args, convert_finetuning_dataset_from_args, convert_text_to_mds_from_args, ) @@ -240,3 +242,27 @@ def convert_text_to_mds( trust_remote_code=trust_remote_code, logging_level=logging_level, ) + + +@app.command(name='convert_delta_to_json') +def convert_delta_to_json_cli( + delta_table_name: Annotated[str, Option(..., help='UC table ..')], + json_output_folder: Annotated[str, Option(..., help='Local path to save the converted json')], + http_path: Annotated[Optional[str], Option(help='If set, dbsql method is used')] = None, + batch_size: Annotated[int, Option(help='Row chunks to transmit a time to avoid OOM')] = 1 << 30, + processes: Annotated[int, Option(help='Number of processes allowed to use')] = os.cpu_count(), # type: ignore + cluster_id: Annotated[Optional[str], Option(help='Cluster ID with runtime newer than 14.1.0 and access mode of either assigned or shared can use databricks-connect.')] = None, + use_serverless: Annotated[bool, Option(help='Use serverless or not. 
Make sure the workspace is entitled with serverless')] = False, + json_output_filename: Annotated[str, Option(help='The name of the combined final jsonl that combines all partitioned jsonl')] = 'train-00000-of-00001.jsonl', +): + """Convert a Delta table into JSON files.""" + convert_delta_to_json_from_args( + delta_table_name=delta_table_name, + json_output_folder=json_output_folder, + http_path=http_path, + batch_size=batch_size, + processes=processes, + cluster_id=cluster_id, + use_serverless=use_serverless, + json_output_filename=json_output_filename, + ) diff --git a/llmfoundry/command_utils/__init__.py b/llmfoundry/command_utils/__init__.py index 3a99f09f86..0226c4f408 100644 --- a/llmfoundry/command_utils/__init__.py +++ b/llmfoundry/command_utils/__init__.py @@ -8,6 +8,10 @@ convert_dataset_json, convert_dataset_json_from_args, ) +from llmfoundry.command_utils.data_prep.convert_delta_to_json import ( + convert_delta_to_json_from_args, + fetch_DT, +) from llmfoundry.command_utils.data_prep.convert_finetuning_dataset import ( convert_finetuning_dataset, convert_finetuning_dataset_from_args, @@ -44,4 +48,6 @@ 'convert_finetuning_dataset', 'convert_text_to_mds', 'convert_text_to_mds_from_args', + 'convert_delta_to_json_from_args', + 'fetch_DT', ] diff --git a/llmfoundry/command_utils/data_prep/convert_delta_to_json.py b/llmfoundry/command_utils/data_prep/convert_delta_to_json.py new file mode 100644 index 0000000000..b76e457e2c --- /dev/null +++ b/llmfoundry/command_utils/data_prep/convert_delta_to_json.py @@ -0,0 +1,758 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +import logging +import os +import re +import time +import urllib.parse +from collections import namedtuple +from concurrent.futures import ProcessPoolExecutor +from typing import TYPE_CHECKING, Iterable, List, Optional, Tuple, Union +from uuid import uuid4 + +import google.protobuf.any_pb2 as any_pb2 +import pandas as pd +import pyarrow as pa +import requests +from composer.utils import retry +from packaging import version + +from llmfoundry.utils.exceptions import ( + ClusterDoesNotExistError, + FailedToConnectToDatabricksError, + FailedToCreateSQLConnectionError, +) + +if TYPE_CHECKING: + import pyspark.sql.connect.proto as pb2 + from databricks.sql.client import Connection as Connection + from databricks.sql.client import Cursor as Cursor + from pyspark.sql import SparkSession + from pyspark.sql.connect.client.core import SparkConnectClient + from pyspark.sql.connect.dataframe import DataFrame + from pyspark.sql.dataframe import DataFrame as SparkDataFrame + from pyspark.sql.types import Row + +try: + from pyspark.sql.connect.client.core import SparkConnectClient + spark_connect_client_installed = True +except ImportError: + spark_connect_client_installed = False + +try: + from pyspark.sql.connect.dataframe import DataFrame + data_frame_installed = True +except ImportError: + data_frame_installed = False + +MINIMUM_DB_CONNECT_DBR_VERSION = '14.1' +MINIMUM_SQ_CONNECT_DBR_VERSION = '12.2' + +TABLENAME_PATTERN = re.compile(r'(\S+)\.(\S+)\.(\S+)') + +log = logging.getLogger(__name__) + +Result = namedtuple( + 'Result', + [ + 'url', + 'row_count', + 'compressed_size', + 'uncompressed_size', + ], +) # pyright: ignore + +# ``collect_as_cf`` is an addon new feature monkey patch on top of the DB Connect package. +# It allows the client to fetch the results in different formats from the server. +# To be able to use the code make sure this module is not overriden by DB Connect classes. 
+ + +def to_cf(self: 'SparkConnectClient', + plan: 'pb2.Plan', + type: str = 'json') -> Tuple[List[Result], int, bool]: + """Executes the query plans and return as presigned URLS for cloud fetch. + + It can handle the current output formats that are supported by the server. + In contrast to the regular API methods of the client, this method does not + return the schema and drops all other responses. + + Args: + plan (pb2.Plan): The plan object to be executed by spark. + type (str): The output format of the result, supported formats are 'json', 'csv', and 'arrow'. + + Returns: + Tuple[List[Result], int, bool]: A tuple containing: + - A list of Result namedtuples, each containing a URL, row count, compressed size, + and uncompressed size of the part of the result. + - Total row count of all parts of the result. + - A boolean indicating whether the result has been truncated. + """ + req = self._execute_plan_request_with_metadata() + req.plan.CopyFrom(plan) + + import pyspark.sql.connect.proto as pb2 + import pyspark.sql.connect.proto.cloud_pb2 as cloud_pb2 + + # Add the request options + if type == 'json': + format = cloud_pb2.ResultOptions.CloudOptions.FORMAT_JSON + elif type == 'csv': + format = cloud_pb2.ResultOptions.CloudOptions.FORMAT_CSV + elif type == 'arrow': + format = cloud_pb2.ResultOptions.CloudOptions.FORMAT_ARROW + else: + raise ValueError( + f'Only formats json, csv, and arrow are supported. Got invalid type {type}', + ) + + ro = cloud_pb2.ResultOptions( + type=cloud_pb2.ResultOptions.TYPE_CLOUD, + cloudOptions=cloud_pb2.ResultOptions.CloudOptions( + format=format, + useCompression=False, + ), + ) + cloud_option = any_pb2.Any() + cloud_option.Pack(ro) + req.request_options.append( + pb2.ExecutePlanRequest.RequestOption(extension=cloud_option), + ) + + # Create the iterator + from pyspark.sql.connect.client.reattach import \ + ExecutePlanResponseReattachableIterator + iterator = ExecutePlanResponseReattachableIterator( + req, + self._stub, + self._retry_policy, + self._builder.metadata(), + ) + # Iterate over the response + result = [] + row_count = 0 + is_overflow = False + + for response in iterator: + if response.HasField('extension') and response.extension.Is( + cloud_pb2.CloudResultBatch.DESCRIPTOR, + ): + batch = cloud_pb2.CloudResultBatch() + if not response.extension.Is(cloud_pb2.CloudResultBatch.DESCRIPTOR): + raise ValueError( + 'Response extension is not of type CloudResultBatch.', + ) + response.extension.Unpack(batch) + result += [ + Result( + b.url, + b.row_count, + b.compressed_size, + b.uncompressed_size, + ) for b in batch.results + ] + row_count += sum(result.row_count for result in batch.results) + is_overflow |= batch.truncated + return result, row_count, is_overflow + + +if spark_connect_client_installed: + SparkConnectClient.to_cf = to_cf # pyright: ignore + + +def collect_as_cf(self: 'DataFrame', + type: str = 'json') -> Tuple[List[Result], int, bool]: + """Collects DataFrame execution plan as presigned URLs. + + This method is a wrapper around the `to_cf` method of SparkConnectClient. It takes the + execution plan of the current DataFrame, converts it to a protocol buffer format, and then + uses the `to_cf` method to execute the plan and fetch results as presigned URLs. + + Args: + type (str): The output format of the result, supported formats are 'json', 'csv', and 'arrow'. 
+ + Returns: + Tuple[List[Result], int, bool]: A tuple containing: + - A list of Result namedtuples, each containing a URL, row count, compressed size, + and uncompressed size of the part of the result. + - Total row count of all parts of the result. + - A boolean indicating whether the result is truncated or overflowed. + """ + query = self._plan.to_proto(self._session.client) # pyright: ignore + return self._session.client.to_cf(query, type) # pyright: ignore + + +if data_frame_installed: + DataFrame.collect_cf = collect_as_cf # pyright: ignore + + +def iterative_combine_jsons(json_directory: str, output_file: str) -> None: + """Combine jsonl files in json_directory into one big jsonl file. + + This function does not work for nested subdirectories. + + Args: + json_directory(str): directory containing the JSONL files + output_file(str): path to the output combined JSONL file + """ + json_files = [f for f in os.listdir(json_directory) if f.endswith('.jsonl')] + with open(output_file, 'w') as outfile: + for file_name in json_files: + with open(os.path.join(json_directory, file_name), 'r') as infile: + for line in infile: + outfile.write(line) + log.info('JSON files have been combined into a JSONL file.') + + +def run_query( + query: str, + method: str, + cursor: Optional['Cursor'] = None, + spark: Optional['SparkSession'] = None, + collect: bool = True, +) -> Optional[Union[List['Row'], 'DataFrame', 'SparkDataFrame']]: + """Run SQL query via databricks-connect or databricks-sql. + + Args: + query (str): sql query + method (str): select from dbsql and dbconnect + cursor (Optional[Cursor]): connection.cursor + spark (Optional[SparkSession]): spark session + collect (bool): whether to get the underlying data from spark dataframe + """ + if method == 'dbsql': + if cursor is None: + raise ValueError(f'cursor cannot be None if using method dbsql') + cursor.execute(query) + if collect: + return cursor.fetchall() + elif method == 'dbconnect': + if spark == None: + raise ValueError(f'sparkSession is required for dbconnect') + df = spark.sql(query) + if collect: + return df.collect() + return df + else: + raise ValueError(f'Unrecognized method: {method}') + + +def get_args(signed: List, json_output_folder: str, columns: List) -> Iterable: + for i, r in enumerate(signed): + yield (i, r.url, json_output_folder, columns) + + +def download( + ipart: int, + url: str, + json_output_folder: str, + columns: Optional[List] = None, + resp_format: str = 'arrow', + compressed: bool = False, +) -> None: + """Thread download presigned url and save to jsonl locally. + + Args: + ipart (int): presigned url id + url (str): presigned url + json_output_folder (str): directory to save the ipart_th segment of dataframe + columns (list): schema to save to json + resp_format (str): whether to use arrow or json when collect + compressed (bool): if data is compressed before downloading. Need decompress if compressed=True. + """ + resp = requests.get(url) + if resp.status_code == 200: + if resp_format == 'json': + data = resp.json() + pd.DataFrame(data, columns=columns).to_json( + os.path.join( + json_output_folder, + 'part_' + str(ipart) + '.jsonl', + ), + orient='records', + lines=True, + ) + return + + # When resp_format is arrow: + if compressed: + # The data is lz4 compressed arrow format. 
+ # Decompress the data + import lz4.frame + decompressed_data = lz4.frame.decompress(resp.content) + # Convert the decompressed data into a PyArrow table + reader = pa.ipc.open_stream(decompressed_data) + else: + reader = pa.ipc.open_stream(resp.content) + table = reader.read_all() + + # Convert the PyArrow table into a pandas DataFrame + df = table.to_pandas() + df.to_json( + os.path.join(json_output_folder, 'part_' + str(ipart) + '.jsonl'), + orient='records', + lines=True, + force_ascii=False, + ) + + +def download_starargs(args: Tuple) -> None: + return download(*args) + + +def format_tablename(table_name: str) -> str: + """Escape catalog, schema and table names with backticks. + + This needs to be done when running SQL queries/setting spark sessions to prevent invalid identifier errors. + + Args: + table_name (str): catalog.scheme.tablename on UC + """ + match = re.match(TABLENAME_PATTERN, table_name) + + if match is None: + return table_name + + formatted_identifiers = [] + for i in range(1, 4): + identifier = f'`{match.group(i)}`' + formatted_identifiers.append(identifier) + + return '.'.join(formatted_identifiers) + + +def fetch_data( + method: str, + cursor: Optional['Cursor'], + sparkSession: Optional['SparkSession'], + start: int, + end: int, + order_by: str, + tablename: str, + columns_str: str, + json_output_folder: str, +) -> None: + """Fetches a specified range of rows from a given table to a json file. + + This function executes a SQL query to retrieve a range of rows, determined by 'start' and 'end' indexes, + from a specified table and column set. The fetched data is then exported as a JSON file. + + Args: + method (str): The method to use for fetching data, either 'dbconnect' or 'dbsql'. + cursor (Optional[Cursor]): The cursor object for executing queries in 'dbsql' method. + sparkSession (Optional[SparkSession]): The Spark session object for executing queries in 'dbconnect' method. + start (int): The starting index for row fetching. + end (int): The ending index for row fetching. + order_by (str): The column name to use for ordering the rows. + tablename (str): The name of the table from which to fetch the data. + columns_str (str): The string representation of the columns to select from the table. + json_output_folder (str): The file path where the resulting JSON file will be saved. + + Returns: + None: The function doesn't return any value, but writes the result to a JSONL file. 
+ """ + query = f""" + WITH NumberedRows AS ( + SELECT + *, + ROW_NUMBER() OVER (ORDER BY {order_by}) AS rn + FROM + {tablename} + ) + SELECT {columns_str} + FROM NumberedRows + WHERE rn BETWEEN {start+1} AND {end}""" + + if method == 'dbconnect': + spark_df = run_query(query, method, cursor, sparkSession, collect=False) + if spark_df is None: + raise RuntimeError( + f'Expect spark dataframe with {query} but got None', + ) + pdf = spark_df.toPandas() # pyright: ignore + else: # method == 'dbsql': + ans = run_query(query, method, cursor, sparkSession, collect=True) + if ans is None: + raise RuntimeError(f'Got empty results with {query}') + records = [r.asDict() for r in ans] # pyright: ignore + pdf = pd.DataFrame.from_dict(records) + + pdf.to_json( + os.path.join(json_output_folder, f'part_{start+1}_{end}.jsonl'), + orient='records', + lines=True, + ) + + +@retry(Exception, num_attempts=5, initial_backoff=1.0, max_jitter=0.5) +def get_total_rows( + tablename: str, + method: str, + cursor: Optional['Cursor'], + sparkSession: Optional['SparkSession'], +): + ans = run_query( + f'SELECT COUNT(*) FROM {tablename}', + method, + cursor, + sparkSession, + ) + nrows = [row.asDict() for row in ans][0].popitem()[1] # pyright: ignore + log.info(f'total_rows = {nrows}') + return nrows + + +@retry(Exception, num_attempts=5, initial_backoff=1.0, max_jitter=0.5) +def get_columns_info( + tablename: str, + method: str, + cursor: Optional['Cursor'], + sparkSession: Optional['SparkSession'], +): + ans = run_query( + f'SHOW COLUMNS IN {tablename}', + method, + cursor, + sparkSession, + ) + columns = [row.asDict().popitem()[1] for row in ans] # pyright: ignore + order_by = columns[0] + columns_str = ','.join(columns) + log.info(f'order by column {order_by}') + return columns, order_by, columns_str + + +def fetch( + method: str, + tablename: str, + json_output_folder: str, + batch_size: int = 1 << 30, + processes: int = 1, + sparkSession: Optional['SparkSession'] = None, + dbsql: Optional['Connection'] = None, +) -> None: + """Fetch UC delta table with databricks-connect as JSONL. + + Args: + method (str): dbconnect or dbsql + tablename (str): catalog.scheme.tablename on UC + json_output_folder (str): path to write the result json file to + batch_size (int): number of rows that dbsql fetches each time to avoid OOM + processes (int): max number of processes to use to parallelize the fetch + sparkSession (pyspark.sql.sparksession): spark session + dbsql (databricks.sql.connect): dbsql session + """ + cursor = dbsql.cursor() if dbsql is not None else None + try: + nrows = get_total_rows( + tablename, + method, + cursor, + sparkSession, + ) + except Exception as e: + raise RuntimeError( + f'Error in get rows from {tablename}. Restart sparkSession and try again', + ) from e + + try: + columns, order_by, columns_str = get_columns_info( + tablename, + method, + cursor, + sparkSession, + ) + except Exception as e: + raise RuntimeError( + f'Error in get columns from {tablename}. Restart sparkSession and try again', + ) from e + + if method == 'dbconnect' and sparkSession is not None: + log.info(f'{processes=}') + df = sparkSession.table(tablename) + + # Running the query and collecting the data as arrow or json. + signed, _, _ = df.collect_cf('arrow') # pyright: ignore + log.info(f'len(signed) = {len(signed)}') + + args = get_args(signed, json_output_folder, columns) + + # Stopping the SparkSession to avoid spilling connection state into the subprocesses. 
+ sparkSession.stop() + + with ProcessPoolExecutor(max_workers=processes) as executor: + list(executor.map(download_starargs, args)) + + elif method == 'dbsql' and cursor is not None: + for start in range(0, nrows, batch_size): + log.warning(f'batch {start}') + end = min(start + batch_size, nrows) + fetch_data( + method, + cursor, + sparkSession, + start, + end, + order_by, + tablename, + columns_str, + json_output_folder, + ) + + if cursor is not None: + cursor.close() + + +def validate_and_get_cluster_info( + cluster_id: Optional[str], + databricks_host: str, + databricks_token: str, + http_path: Optional[str], + use_serverless: bool = False, +) -> tuple: + """Validate and get cluster info for running the Delta to JSONL conversion. + + Args: + cluster_id (str): cluster id to validate and fetch additional info for + databricks_host (str): databricks host name + databricks_token (str): databricks auth token + http_path (Optional[str]): http path to use for sql connect + use_serverless (bool): whether to use serverless or not + """ + method = 'dbsql' + dbsql = None + sparkSession = None + + if use_serverless: + method = 'dbconnect' + else: + if not cluster_id: + raise ValueError( + 'cluster_id is not set, however use_serverless is False', + ) + from databricks.sdk import WorkspaceClient + w = WorkspaceClient() + res = w.clusters.get(cluster_id=cluster_id) + if res is None: + raise ClusterDoesNotExistError(cluster_id) + + assert res.spark_version is not None + stripped_runtime = re.sub( + r'[a-zA-Z]', + '', + res.spark_version.split('-scala') + [0].replace( # type: ignore + 'x-snapshot', '', + ), + ) + runtime_version = re.sub(r'[.-]*$', '', stripped_runtime) + if version.parse( + runtime_version, + ) < version.parse(MINIMUM_SQ_CONNECT_DBR_VERSION): + raise ValueError( + f'The minium DBR version required is {MINIMUM_SQ_CONNECT_DBR_VERSION} but got {version.parse(runtime_version)}', + ) + + if http_path is None and version.parse( + runtime_version, + ) >= version.parse(MINIMUM_DB_CONNECT_DBR_VERSION): + method = 'dbconnect' + + if method == 'dbconnect': + from databricks.connect import DatabricksSession + try: + if use_serverless: + session_id = str(uuid4()) + sparkSession = DatabricksSession.builder.host( + databricks_host, + ).token( + databricks_token, + ).header('x-databricks-session-id', session_id).getOrCreate() + + else: + if not cluster_id: + raise ValueError('cluster_id is needed for dbconnect.',) + sparkSession = DatabricksSession.builder.remote( + host=databricks_host, + token=databricks_token, + cluster_id=cluster_id, + ).getOrCreate() + + except Exception as e: + raise FailedToConnectToDatabricksError() from e + else: + try: + from databricks import sql + dbsql = sql.connect( + server_hostname=re.compile(r'^https?://').sub( + '', databricks_host).strip( + ), # sqlconnect hangs if hostname starts with https + http_path=http_path, + access_token=databricks_token, + ) + except Exception as e: + raise FailedToCreateSQLConnectionError() from e + return method, dbsql, sparkSession + + +def fetch_DT( + delta_table_name: str, + json_output_folder: str, + http_path: Optional[str], + cluster_id: Optional[str], + use_serverless: bool, + DATABRICKS_HOST: str, + DATABRICKS_TOKEN: str, + batch_size: int = 1 << 30, + processes: int = os.cpu_count(), # type: ignore + json_output_filename: str = 'train-00000-of-00001.jsonl', +) -> None: + """Fetch UC Delta Table to local as jsonl.""" + log.info(f'Start .... 
Convert delta to json') + + obj = urllib.parse.urlparse(json_output_folder) + if obj.scheme != '': + raise ValueError( + 'Check the json_output_folder and verify it is a local path!', + ) + + if os.path.exists(json_output_folder): + if not os.path.isdir(json_output_folder) or os.listdir( + json_output_folder, + ): + raise RuntimeError( + f'Output folder {json_output_folder} already exists and is not empty. Please remove it and retry.', + ) + + os.makedirs(json_output_folder, exist_ok=True) + + if not json_output_filename.endswith('.jsonl'): + raise ValueError('json_output_filename needs to be a jsonl file') + + log.info(f'Directory {json_output_folder} created.') + + # validate_and_get_cluster_info allows cluster_id to be None if use_serverless is True + method, dbsql, sparkSession = validate_and_get_cluster_info( + cluster_id=cluster_id, + databricks_host=DATABRICKS_HOST, + databricks_token=DATABRICKS_TOKEN, + http_path=http_path, + use_serverless=use_serverless, + ) + + formatted_delta_table_name = format_tablename(delta_table_name) + + fetch( + method, + formatted_delta_table_name, + json_output_folder, + batch_size, + processes, + sparkSession, + dbsql, + ) + + if dbsql is not None: + dbsql.close() + + # combine downloaded jsonl into one big jsonl for IFT + iterative_combine_jsons( + json_output_folder, + os.path.join(json_output_folder, json_output_filename), + ) + + +def _check_imports(): + try: + import lz4.frame + _ = lz4.frame + except ImportError as e: + raise ImportError('lz4 is not installed.') from e + + try: + from databricks.connect import DatabricksSession + _ = DatabricksSession + except ImportError as e: + raise ImportError( + 'databricks-connect is not installed or improperly configured.', + ) from e + + try: + from databricks import sql + from databricks.sdk import WorkspaceClient + from databricks.sql.client import Connection as Connection + from databricks.sql.client import Cursor as Cursor + _ = WorkspaceClient, Connection, Cursor, sql + except ImportError as e: + raise ImportError( + 'databricks-sdk is not installed or improperly configured.', + ) from e + + try: + import pyspark.sql.connect.proto as pb2 + import pyspark.sql.connect.proto.cloud_pb2 as cloud_pb2 + from pyspark.sql import SparkSession + from pyspark.sql.connect.client.core import SparkConnectClient + from pyspark.sql.connect.client.reattach import \ + ExecutePlanResponseReattachableIterator + from pyspark.sql.connect.dataframe import DataFrame + from pyspark.sql.dataframe import DataFrame as SparkDataFrame + from pyspark.sql.types import Row + _ = ( + pb2, + cloud_pb2, + SparkSession, + SparkConnectClient, + ExecutePlanResponseReattachableIterator, + DataFrame, + SparkDataFrame, + Row, + ) + except ImportError as e: + raise ImportError( + 'pyspark is not installed or improperly configured.', + ) from e + + +def convert_delta_to_json_from_args( + delta_table_name: str, + json_output_folder: str, + http_path: Optional[str], + cluster_id: Optional[str], + use_serverless: bool, + batch_size: int, + processes: int, + json_output_filename: str, +) -> None: + """A wrapper for `convert_dataset_json` that parses arguments. + + Args: + delta_table_name (str): UC table ..
+ json_output_folder (str): Local path to save the converted json + http_path (Optional[str]): If set, dbsql method is used + batch_size (int): Row chunks to transmit a time to avoid OOM + processes (int): Number of processes allowed to use + cluster_id (Optional[str]): Cluster ID with runtime newer than 14.1.0 and access mode of either assigned or shared can use databricks-connect. + use_serverless (bool): Use serverless or not. Make sure the workspace is entitled with serverless + json_output_filename (str): The name of the combined final jsonl that combines all partitioned jsonl + """ + _check_imports() + from databricks.sdk import WorkspaceClient + w = WorkspaceClient() + DATABRICKS_HOST = w.config.host + DATABRICKS_TOKEN = w.config.token + + tik = time.time() + fetch_DT( + delta_table_name=delta_table_name, + json_output_folder=json_output_folder, + http_path=http_path, + batch_size=batch_size, + processes=processes, + cluster_id=cluster_id, + use_serverless=use_serverless, + json_output_filename=json_output_filename, + DATABRICKS_HOST=DATABRICKS_HOST, + DATABRICKS_TOKEN=DATABRICKS_TOKEN, + ) + log.info(f'Elapsed time {time.time() - tik}') diff --git a/scripts/data_prep/convert_delta_to_json.py b/scripts/data_prep/convert_delta_to_json.py index 3b88ba668f..277a8c1ffc 100644 --- a/scripts/data_prep/convert_delta_to_json.py +++ b/scripts/data_prep/convert_delta_to_json.py @@ -4,41 +4,12 @@ import logging import os import re -import time -import urllib.parse -from argparse import ArgumentParser, Namespace -from collections import namedtuple -from concurrent.futures import ProcessPoolExecutor -from typing import Iterable, List, Optional, Tuple, Union -from uuid import uuid4 +from argparse import ArgumentParser -import google.protobuf.any_pb2 as any_pb2 -import lz4.frame -import pandas as pd -import pyarrow as pa -import pyspark.sql.connect.proto as pb2 -import pyspark.sql.connect.proto.cloud_pb2 as cloud_pb2 -import requests -from composer.utils import retry -from databricks import sql -from databricks.connect import DatabricksSession -from databricks.sdk import WorkspaceClient from databricks.sql.client import Connection as Connection from databricks.sql.client import Cursor as Cursor -from packaging import version -from pyspark.sql import SparkSession -from pyspark.sql.connect.client.core import SparkConnectClient -from pyspark.sql.connect.client.reattach import \ - ExecutePlanResponseReattachableIterator -from pyspark.sql.connect.dataframe import DataFrame -from pyspark.sql.dataframe import DataFrame as SparkDataFrame -from pyspark.sql.types import Row -from llmfoundry.utils.exceptions import ( - ClusterDoesNotExistError, - FailedToConnectToDatabricksError, - FailedToCreateSQLConnectionError, -) +from llmfoundry.command_utils import convert_delta_to_json_from_args MINIMUM_DB_CONNECT_DBR_VERSION = '14.1' MINIMUM_SQ_CONNECT_DBR_VERSION = '12.2' @@ -47,617 +18,6 @@ log = logging.getLogger(__name__) -Result = namedtuple( - 'Result', - [ - 'url', - 'row_count', - 'compressed_size', - 'uncompressed_size', - ], -) # pyright: ignore - -# ``collect_as_cf`` is an addon new feature monkey patch on top of the DB Connect package. -# It allows the client to fetch the results in different formats from the server. -# To be able to use the code make sure this module is not overriden by DB Connect classes. - - -def to_cf(self: SparkConnectClient, - plan: pb2.Plan, - type: str = 'json') -> Tuple[List[Result], int, bool]: - """Executes the query plans and return as presigned URLS for cloud fetch. 
- - It can handle the current output formats that are supported by the server. - In contrast to the regular API methods of the client, this method does not - return the schema and drops all other responses. - - Args: - plan (pb2.Plan): The plan object to be executed by spark. - type (str): The output format of the result, supported formats are 'json', 'csv', and 'arrow'. - - Returns: - Tuple[List[Result], int, bool]: A tuple containing: - - A list of Result namedtuples, each containing a URL, row count, compressed size, - and uncompressed size of the part of the result. - - Total row count of all parts of the result. - - A boolean indicating whether the result has been truncated. - """ - log.info(f'Executing query plan with format: {type}') - - req = self._execute_plan_request_with_metadata() - req.plan.CopyFrom(plan) - - # Add the request options - if type == 'json': - format = cloud_pb2.ResultOptions.CloudOptions.FORMAT_JSON - elif type == 'csv': - format = cloud_pb2.ResultOptions.CloudOptions.FORMAT_CSV - elif type == 'arrow': - format = cloud_pb2.ResultOptions.CloudOptions.FORMAT_ARROW - else: - raise ValueError( - f'Only formats json, csv, and arrow are supported. Got invalid type {type}', - ) - - ro = cloud_pb2.ResultOptions( - type=cloud_pb2.ResultOptions.TYPE_CLOUD, - cloudOptions=cloud_pb2.ResultOptions.CloudOptions( - format=format, - useCompression=False, - ), - ) - cloud_option = any_pb2.Any() - cloud_option.Pack(ro) - req.request_options.append( - pb2.ExecutePlanRequest.RequestOption(extension=cloud_option), - ) - - # Create the iterator - iterator = ExecutePlanResponseReattachableIterator( - req, - self._stub, - self._retry_policy, - self._builder.metadata(), - ) - # Iterate over the response - result = [] - row_count = 0 - is_overflow = False - - for response in iterator: - if response.HasField('extension') and response.extension.Is( - cloud_pb2.CloudResultBatch.DESCRIPTOR, - ): - batch = cloud_pb2.CloudResultBatch() - if not response.extension.Is(cloud_pb2.CloudResultBatch.DESCRIPTOR): - raise ValueError( - 'Response extension is not of type CloudResultBatch.', - ) - response.extension.Unpack(batch) - result += [ - Result( - b.url, - b.row_count, - b.compressed_size, - b.uncompressed_size, - ) for b in batch.results - ] - row_count += sum(result.row_count for result in batch.results) - is_overflow |= batch.truncated - return result, row_count, is_overflow - - -SparkConnectClient.to_cf = to_cf # pyright: ignore - - -def collect_as_cf(self: DataFrame, - type: str = 'json') -> Tuple[List[Result], int, bool]: - """Collects DataFrame execution plan as presigned URLs. - - This method is a wrapper around the `to_cf` method of SparkConnectClient. It takes the - execution plan of the current DataFrame, converts it to a protocol buffer format, and then - uses the `to_cf` method to execute the plan and fetch results as presigned URLs. - - Args: - type (str): The output format of the result, supported formats are 'json', 'csv', and 'arrow'. - - Returns: - Tuple[List[Result], int, bool]: A tuple containing: - - A list of Result namedtuples, each containing a URL, row count, compressed size, - and uncompressed size of the part of the result. - - Total row count of all parts of the result. - - A boolean indicating whether the result is truncated or overflowed. 
- """ - log.info(f'Collecting DataFrame as cloud fetch with format: {type}') - query = self._plan.to_proto(self._session.client) # pyright: ignore - return self._session.client.to_cf(query, type) # pyright: ignore - - -DataFrame.collect_cf = collect_as_cf # pyright: ignore - - -def iterative_combine_jsons(json_directory: str, output_file: str) -> None: - """Combine jsonl files in json_directory into one big jsonl file. - - This function does not work for nested subdirectories. - - Args: - json_directory(str): directory containing the JSONL files - output_file(str): path to the output combined JSONL file - """ - log.info( - f'Starting to combine JSON files from {json_directory} into {output_file}', - ) - json_files = [f for f in os.listdir(json_directory) if f.endswith('.jsonl')] - log.info(f'Found {len(json_files)} JSON files to combine') - with open(output_file, 'w') as outfile: - for file_name in json_files: - log.debug(f'Processing file: {file_name}') - with open(os.path.join(json_directory, file_name), 'r') as infile: - for line in infile: - outfile.write(line) - log.info('JSON files have been successfully combined into a JSONL file.') - - -def run_query( - query: str, - method: str, - cursor: Optional[Cursor] = None, - spark: Optional[SparkSession] = None, - collect: bool = True, -) -> Optional[Union[List[Row], DataFrame, SparkDataFrame]]: - """Run SQL query via databricks-connect or databricks-sql. - - Args: - query (str): sql query - method (str): select from dbsql and dbconnect - cursor (Optional[Cursor]): connection.cursor - spark (Optional[SparkSession]): spark session - collect (bool): whether to get the underlying data from spark dataframe - """ - log.info(f'Executing query using method: {method}') - log.debug(f'Query: {query}') - - if method == 'dbsql': - if cursor is None: - raise ValueError(f'cursor cannot be None if using method dbsql') - cursor.execute(query) - if collect: - return cursor.fetchall() - elif method == 'dbconnect': - if spark == None: - raise ValueError(f'sparkSession is required for dbconnect') - df = spark.sql(query) - if collect: - return df.collect() - return df - else: - raise ValueError(f'Unrecognized method: {method}') - - -def get_args(signed: List, json_output_folder: str, columns: List) -> Iterable: - for i, r in enumerate(signed): - yield (i, r.url, json_output_folder, columns) - - -def download( - ipart: int, - url: str, - json_output_folder: str, - columns: Optional[List] = None, - resp_format: str = 'arrow', - compressed: bool = False, -) -> None: - """Thread download presigned url and save to jsonl locally. - - Args: - ipart (int): presigned url id - url (str): presigned url - json_output_folder (str): directory to save the ipart_th segment of dataframe - columns (list): schema to save to json - resp_format (str): whether to use arrow or json when collect - compressed (bool): if data is compressed before downloading. Need decompress if compressed=True. - """ - log.info(f'Downloading part {ipart} from URL: {url}') - - resp = requests.get(url) - if resp.status_code == 200: - if resp_format == 'json': - data = resp.json() - pd.DataFrame(data, columns=columns).to_json( - os.path.join( - json_output_folder, - 'part_' + str(ipart) + '.jsonl', - ), - orient='records', - lines=True, - ) - return - - # When resp_format is arrow: - if compressed: - # The data is lz4 compressed arrow format. 
- # Decompress the data - decompressed_data = lz4.frame.decompress(resp.content) - # Convert the decompressed data into a PyArrow table - reader = pa.ipc.open_stream(decompressed_data) - else: - reader = pa.ipc.open_stream(resp.content) - table = reader.read_all() - - # Convert the PyArrow table into a pandas DataFrame - df = table.to_pandas() - df.to_json( - os.path.join(json_output_folder, 'part_' + str(ipart) + '.jsonl'), - orient='records', - lines=True, - force_ascii=False, - ) - - -def download_starargs(args: Tuple) -> None: - return download(*args) - - -def format_tablename(table_name: str) -> str: - """Escape catalog, schema and table names with backticks. - - This needs to be done when running SQL queries/setting spark sessions to prevent invalid identifier errors. - - Args: - table_name (str): catalog.scheme.tablename on UC - """ - log.debug(f'Formatting table name: {table_name}') - match = re.match(TABLENAME_PATTERN, table_name) - - if match is None: - return table_name - - formatted_identifiers = [] - for i in range(1, 4): - identifier = f'`{match.group(i)}`' - formatted_identifiers.append(identifier) - - return '.'.join(formatted_identifiers) - - -def fetch_data( - method: str, - cursor: Optional[Cursor], - sparkSession: Optional[SparkSession], - start: int, - end: int, - order_by: str, - tablename: str, - columns_str: str, - json_output_folder: str, -) -> None: - """Fetches a specified range of rows from a given table to a json file. - - This function executes a SQL query to retrieve a range of rows, determined by 'start' and 'end' indexes, - from a specified table and column set. The fetched data is then exported as a JSON file. - - Args: - method (str): The method to use for fetching data, either 'dbconnect' or 'dbsql'. - cursor (Optional[Cursor]): The cursor object for executing queries in 'dbsql' method. - sparkSession (Optional[SparkSession]): The Spark session object for executing queries in 'dbconnect' method. - start (int): The starting index for row fetching. - end (int): The ending index for row fetching. - order_by (str): The column name to use for ordering the rows. - tablename (str): The name of the table from which to fetch the data. - columns_str (str): The string representation of the columns to select from the table. - json_output_folder (str): The file path where the resulting JSON file will be saved. - - Returns: - None: The function doesn't return any value, but writes the result to a JSONL file. 
- """ - log.info(f'Fetching data from {start} to {end} using method: {method}') - query = f""" - WITH NumberedRows AS ( - SELECT - *, - ROW_NUMBER() OVER (ORDER BY {order_by}) AS rn - FROM - {tablename} - ) - SELECT {columns_str} - FROM NumberedRows - WHERE rn BETWEEN {start+1} AND {end}""" - - if method == 'dbconnect': - spark_df = run_query(query, method, cursor, sparkSession, collect=False) - if spark_df is None: - raise RuntimeError( - f'Expect spark dataframe with {query} but got None', - ) - pdf = spark_df.toPandas() # pyright: ignore - else: # method == 'dbsql': - ans = run_query(query, method, cursor, sparkSession, collect=True) - if ans is None: - raise RuntimeError(f'Got empty results with {query}') - records = [r.asDict() for r in ans] # pyright: ignore - pdf = pd.DataFrame.from_dict(records) - - pdf.to_json( - os.path.join(json_output_folder, f'part_{start+1}_{end}.jsonl'), - orient='records', - lines=True, - ) - - -@retry(Exception, num_attempts=5, initial_backoff=1.0, max_jitter=0.5) -def get_total_rows( - tablename: str, - method: str, - cursor: Optional[Cursor], - sparkSession: Optional[SparkSession], -): - ans = run_query( - f'SELECT COUNT(*) FROM {tablename}', - method, - cursor, - sparkSession, - ) - nrows = [row.asDict() for row in ans][0].popitem()[1] # pyright: ignore - log.info(f'total_rows = {nrows}') - return nrows - - -@retry(Exception, num_attempts=5, initial_backoff=1.0, max_jitter=0.5) -def get_columns_info( - tablename: str, - method: str, - cursor: Optional[Cursor], - sparkSession: Optional[SparkSession], -): - ans = run_query( - f'SHOW COLUMNS IN {tablename}', - method, - cursor, - sparkSession, - ) - columns = [row.asDict().popitem()[1] for row in ans] # pyright: ignore - order_by = columns[0] - columns_str = ','.join(columns) - log.info(f'order by column {order_by}') - return columns, order_by, columns_str - - -def fetch( - method: str, - tablename: str, - json_output_folder: str, - batch_size: int = 1 << 30, - processes: int = 1, - sparkSession: Optional[SparkSession] = None, - dbsql: Optional[Connection] = None, -) -> None: - """Fetch UC delta table with databricks-connect as JSONL. - - Args: - method (str): dbconnect or dbsql - tablename (str): catalog.scheme.tablename on UC - json_output_folder (str): path to write the result json file to - batch_size (int): number of rows that dbsql fetches each time to avoid OOM - processes (int): max number of processes to use to parallelize the fetch - sparkSession (pyspark.sql.sparksession): spark session - dbsql (databricks.sql.connect): dbsql session - """ - log.info(f'Starting data fetch for table: {tablename}') - log.info( - f'Method: {method}, Batch size: {batch_size}, Processes: {processes}', - ) - - cursor = dbsql.cursor() if dbsql is not None else None - try: - nrows = get_total_rows( - tablename, - method, - cursor, - sparkSession, - ) - except Exception as e: - raise RuntimeError( - f'Error in get rows from {tablename}. Restart sparkSession and try again', - ) from e - - try: - columns, order_by, columns_str = get_columns_info( - tablename, - method, - cursor, - sparkSession, - ) - except Exception as e: - raise RuntimeError( - f'Error in get columns from {tablename}. Restart sparkSession and try again', - ) from e - - if method == 'dbconnect' and sparkSession is not None: - log.info(f'{processes=}') - df = sparkSession.table(tablename) - - # Running the query and collecting the data as arrow or json. 
- signed, _, _ = df.collect_cf('arrow') # pyright: ignore - log.info(f'len(signed) = {len(signed)}') - - args = get_args(signed, json_output_folder, columns) - - # Stopping the SparkSession to avoid spilling connection state into the subprocesses. - sparkSession.stop() - - with ProcessPoolExecutor(max_workers=processes) as executor: - list(executor.map(download_starargs, args)) - - elif method == 'dbsql' and cursor is not None: - for start in range(0, nrows, batch_size): - log.warning(f'batch {start}') - end = min(start + batch_size, nrows) - fetch_data( - method, - cursor, - sparkSession, - start, - end, - order_by, - tablename, - columns_str, - json_output_folder, - ) - - if cursor is not None: - cursor.close() - - -def validate_and_get_cluster_info( - cluster_id: str, - databricks_host: str, - databricks_token: str, - http_path: Optional[str], - use_serverless: bool = False, -) -> tuple: - """Validate and get cluster info for running the Delta to JSONL conversion. - - Args: - cluster_id (str): cluster id to validate and fetch additional info for - databricks_host (str): databricks host name - databricks_token (str): databricks auth token - http_path (Optional[str]): http path to use for sql connect - use_serverless (bool): whether to use serverless or not - """ - log.info('Validating cluster information and getting connection details') - log.debug( - f'Cluster ID: {cluster_id}, Host: {databricks_host}, Use Serverless: {use_serverless}', - ) - - method = 'dbsql' - dbsql = None - sparkSession = None - - if use_serverless: - method = 'dbconnect' - else: - w = WorkspaceClient() - res = w.clusters.get(cluster_id=cluster_id) - if res is None: - raise ClusterDoesNotExistError(cluster_id) - - assert res.spark_version is not None - stripped_runtime = re.sub( - r'[a-zA-Z]', - '', - res.spark_version.split('-scala') - [0].replace( # type: ignore - 'x-snapshot', '', - ), - ) - runtime_version = re.sub(r'[.-]*$', '', stripped_runtime) - if version.parse( - runtime_version, - ) < version.parse(MINIMUM_SQ_CONNECT_DBR_VERSION): - raise ValueError( - f'The minium DBR version required is {MINIMUM_SQ_CONNECT_DBR_VERSION} but got {version.parse(runtime_version)}', - ) - - if http_path is None and version.parse( - runtime_version, - ) >= version.parse(MINIMUM_DB_CONNECT_DBR_VERSION): - method = 'dbconnect' - - if method == 'dbconnect': - try: - if use_serverless: - session_id = str(uuid4()) - sparkSession = DatabricksSession.builder.host( - databricks_host, - ).token( - databricks_token, - ).header('x-databricks-session-id', session_id).getOrCreate() - - else: - sparkSession = DatabricksSession.builder.remote( - host=databricks_host, - token=databricks_token, - cluster_id=cluster_id, - ).getOrCreate() - - except Exception as e: - raise FailedToConnectToDatabricksError() from e - else: - try: - dbsql = sql.connect( - server_hostname=re.compile(r'^https?://').sub( - '', databricks_host).strip( - ), # sqlconnect hangs if hostname starts with https - http_path=http_path, - access_token=databricks_token, - ) - except Exception as e: - raise FailedToCreateSQLConnectionError() from e - return method, dbsql, sparkSession - - -def fetch_DT(args: Namespace) -> None: - """Fetch UC Delta Table to local as jsonl.""" - log.info(f'Start .... 
Convert delta to json') - log.info('Starting Delta Table to JSON conversion process') - log.info(f'Delta Table: {args.delta_table_name}') - log.info(f'Output Folder: {args.json_output_folder}') - log.info(f'Output Filename: {args.json_output_filename}') - - obj = urllib.parse.urlparse(args.json_output_folder) - if obj.scheme != '': - raise ValueError( - 'Check the json_output_folder and verify it is a local path!', - ) - - if os.path.exists(args.json_output_folder): - if not os.path.isdir(args.json_output_folder) or os.listdir( - args.json_output_folder, - ): - raise RuntimeError( - f'Output folder {args.json_output_folder} already exists and is not empty. Please remove it and retry.', - ) - - os.makedirs(args.json_output_folder, exist_ok=True) - - if not args.json_output_filename.endswith('.jsonl'): - raise ValueError('json_output_filename needs to be a jsonl file') - - log.info(f'Directory {args.json_output_folder} created.') - - method, dbsql, sparkSession = validate_and_get_cluster_info( - cluster_id=args.cluster_id, - databricks_host=args.DATABRICKS_HOST, - databricks_token=args.DATABRICKS_TOKEN, - http_path=args.http_path, - use_serverless=args.use_serverless, - ) - - args.delta_table_name = format_tablename(args.delta_table_name) - - fetch( - method, - args.delta_table_name, - args.json_output_folder, - args.batch_size, - args.processes, - sparkSession, - dbsql, - ) - - if dbsql is not None: - dbsql.close() - - # combine downloaded jsonl into one big jsonl for IFT - iterative_combine_jsons( - args.json_output_folder, - os.path.join(args.json_output_folder, args.json_output_filename), - ) - - log.info('Delta Table to JSON conversion completed successfully') - - if __name__ == '__main__': parser = ArgumentParser( description= @@ -719,11 +79,13 @@ def fetch_DT(args: Namespace) -> None: 'The name of the combined final jsonl that combines all partitioned jsonl', ) args = parser.parse_args() - w = WorkspaceClient() - args.DATABRICKS_HOST = w.config.host - args.DATABRICKS_TOKEN = w.config.token - - tik = time.time() - fetch_DT(args) - log.info(f'Elapsed time {time.time() - tik}') - log.info('Delta Table to JSON conversion script completed') + convert_delta_to_json_from_args( + delta_table_name=args.delta_table_name, + json_output_folder=args.json_output_folder, + http_path=args.http_path, + batch_size=args.batch_size, + processes=args.processes, + cluster_id=args.cluster_id, + use_serverless=args.use_serverless, + json_output_filename=args.json_output_filename, + ) diff --git a/tests/a_scripts/data_prep/test_convert_delta_to_json.py b/tests/a_scripts/data_prep/test_convert_delta_to_json.py index 83d6edeca2..e623467bf7 100644 --- a/tests/a_scripts/data_prep/test_convert_delta_to_json.py +++ b/tests/a_scripts/data_prep/test_convert_delta_to_json.py @@ -1,15 +1,12 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 -# copyright 2022 mosaicml llm foundry authors -# spdx-license-identifier: apache-2.0 - import unittest from argparse import Namespace from typing import Any from unittest.mock import MagicMock, mock_open, patch -from scripts.data_prep.convert_delta_to_json import ( +from llmfoundry.command_utils.data_prep.convert_delta_to_json import ( download, fetch_DT, format_tablename, @@ -20,11 +17,19 @@ class TestConvertDeltaToJsonl(unittest.TestCase): - @patch('scripts.data_prep.convert_delta_to_json.sql.connect') - @patch('scripts.data_prep.convert_delta_to_json.os.makedirs') - @patch('scripts.data_prep.convert_delta_to_json.iterative_combine_jsons') - 
@patch('scripts.data_prep.convert_delta_to_json.fetch') - @patch('scripts.data_prep.convert_delta_to_json.WorkspaceClient') + @patch( + 'databricks.sql.connect', + ) + @patch( + 'llmfoundry.command_utils.data_prep.convert_delta_to_json.os.makedirs', + ) + @patch( + 'llmfoundry.command_utils.data_prep.convert_delta_to_json.iterative_combine_jsons', + ) + @patch('llmfoundry.command_utils.data_prep.convert_delta_to_json.fetch') + @patch( + 'databricks.sdk.WorkspaceClient', + ) def test_stream_delta_to_json( self, mock_workspace_client: Any, @@ -33,19 +38,15 @@ def test_stream_delta_to_json( mock_makedirs: Any, mock_sql_connect: Any, ): - - args = MagicMock() - args.delta_table_name = 'test_table' - args.json_output_folder = '/path/to/jsonl' - args.DATABRICKS_HOST = 'test_host' - args.DATABRICKS_TOKEN = 'test_token' - args.http_path = 'test_path' - args.batch_size = 1000 - args.partitions = 1 - args.cluster_id = '1234' - args.debug = False - args.use_serverless = False - args.json_output_filename = 'combined.jsonl' + delta_table_name = 'test_table' + json_output_folder = '/path/to/jsonl' + DATABRICKS_HOST = 'test_host' + DATABRICKS_TOKEN = 'test_token' + http_path = 'test_path' + batch_size = 1000 + cluster_id = '1234' + use_serverless = False + json_output_filename = 'combined.jsonl' mock_cluster_get = MagicMock() mock_cluster_get.return_value = MagicMock( @@ -53,7 +54,17 @@ def test_stream_delta_to_json( ) mock_workspace_client.return_value.clusters.get = mock_cluster_get - fetch_DT(args) + fetch_DT( + delta_table_name=delta_table_name, + json_output_folder=json_output_folder, + http_path=http_path, + cluster_id=cluster_id, + DATABRICKS_HOST=DATABRICKS_HOST, + DATABRICKS_TOKEN=DATABRICKS_TOKEN, + use_serverless=use_serverless, + batch_size=batch_size, + json_output_filename=json_output_filename, + ) mock_sql_connect.assert_called_once_with( server_hostname='test_host', http_path='test_path', @@ -66,7 +77,9 @@ def test_stream_delta_to_json( '/path/to/jsonl/combined.jsonl', ) - @patch('scripts.data_prep.convert_delta_to_json.os.listdir') + @patch( + 'llmfoundry.command_utils.data_prep.convert_delta_to_json.os.listdir', + ) @patch( 'builtins.open', new_callable=mock_open, @@ -102,7 +115,9 @@ def test_iterative_combine_jsons(self, mock_file: Any, mock_listdir: Any): """ self.assertEqual(mock_file().write.call_count, 2) - @patch('scripts.data_prep.convert_delta_to_json.SparkSession') + @patch( + 'pyspark.sql.SparkSession', + ) def test_run_query_dbconnect(self, mock_spark: Any): method = 'dbconnect' mock_cursor = None @@ -118,7 +133,9 @@ def test_run_query_dbconnect(self, mock_spark: Any): mock_spark.sql.assert_called_once_with('SELECT * FROM table') self.assertEqual(result, 'result') - @patch('scripts.data_prep.convert_delta_to_json.Cursor') + @patch( + 'databricks.sql.client.Cursor', + ) def test_run_query_dbsql(self, mock_cursor: Any): method = 'dbsql' mock_cursor.fetchall.return_value = 'result' @@ -134,14 +151,18 @@ def test_run_query_dbsql(self, mock_cursor: Any): mock_cursor.execute.assert_called_once_with('SELECT * FROM table') self.assertEqual(result, 'result') - @patch('scripts.data_prep.convert_delta_to_json.requests.get') - @patch('scripts.data_prep.convert_delta_to_json.pd.DataFrame.to_json') @patch( - 'scripts.data_prep.convert_delta_to_json.os.path.join', + 'llmfoundry.command_utils.data_prep.convert_delta_to_json.requests.get', + ) + @patch( + 'llmfoundry.command_utils.data_prep.convert_delta_to_json.pd.DataFrame.to_json', + ) + @patch( + 
'llmfoundry.command_utils.data_prep.convert_delta_to_json.os.path.join', return_value='/fake/path/part_1.jsonl', ) @patch( - 'scripts.data_prep.convert_delta_to_json.time.sleep', + 'llmfoundry.command_utils.data_prep.convert_delta_to_json.time.sleep', ) # Mock sleep to speed up the test def test_download_success( self, @@ -174,12 +195,22 @@ def test_download_success( mock_get.assert_called_once_with('http://fakeurl.com/data') - @patch('scripts.data_prep.convert_delta_to_json.sql.connect') - @patch('scripts.data_prep.convert_delta_to_json.DatabricksSession') - @patch('scripts.data_prep.convert_delta_to_json.WorkspaceClient') - @patch('scripts.data_prep.convert_delta_to_json.os.makedirs') - @patch('scripts.data_prep.convert_delta_to_json.iterative_combine_jsons') - @patch('scripts.data_prep.convert_delta_to_json.fetch') + @patch( + 'databricks.sql.connect', + ) + @patch( + 'databricks.connect.DatabricksSession', + ) + @patch( + 'databricks.sdk.WorkspaceClient', + ) + @patch( + 'llmfoundry.command_utils.data_prep.convert_delta_to_json.os.makedirs', + ) + @patch( + 'llmfoundry.command_utils.data_prep.convert_delta_to_json.iterative_combine_jsons', + ) + @patch('llmfoundry.command_utils.data_prep.convert_delta_to_json.fetch') def test_dbconnect_called( self, mock_fetch: Any, @@ -189,17 +220,14 @@ def test_dbconnect_called( mock_databricks_session: Any, mock_sql_connect: Any, ): - - args = MagicMock() - - args.delta_table_name = 'test_table' - args.json_output_folder = '/path/to/jsonl' + delta_table_name = 'test_table' + json_output_folder = '/path/to/jsonl' # Execute function with http_path=None (should use dbconnect) - args.http_path = None - args.cluster_id = '1234' - args.DATABRICKS_HOST = 'host' - args.DATABRICKS_TOKEN = 'token' - args.use_serverless = False + http_path = None + cluster_id = '1234' + DATABRICKS_HOST = 'host' + DATABRICKS_TOKEN = 'token' + use_serverless = False mock_cluster_response = Namespace(spark_version='14.1.0-scala2.12') mock_workspace_client.return_value.clusters.get.return_value = mock_cluster_response @@ -209,19 +237,37 @@ def test_dbconnect_called( ) # Mock return value for getOrCreate mock_databricks_session.builder.remote.return_value = mock_remote - fetch_DT(args) + fetch_DT( + delta_table_name=delta_table_name, + json_output_folder=json_output_folder, + http_path=http_path, + cluster_id=cluster_id, + DATABRICKS_HOST=DATABRICKS_HOST, + DATABRICKS_TOKEN=DATABRICKS_TOKEN, + use_serverless=use_serverless, + ) mock_databricks_session.builder.remote.assert_called_once_with( - host=args.DATABRICKS_HOST, - token=args.DATABRICKS_TOKEN, - cluster_id=args.cluster_id, + host=DATABRICKS_HOST, + token=DATABRICKS_TOKEN, + cluster_id=cluster_id, ) - @patch('scripts.data_prep.convert_delta_to_json.sql.connect') - @patch('scripts.data_prep.convert_delta_to_json.DatabricksSession') - @patch('scripts.data_prep.convert_delta_to_json.WorkspaceClient') - @patch('scripts.data_prep.convert_delta_to_json.os.makedirs') - @patch('scripts.data_prep.convert_delta_to_json.iterative_combine_jsons') - @patch('scripts.data_prep.convert_delta_to_json.fetch') + @patch( + 'databricks.sql.connect', + ) + @patch( + 'databricks.connect.DatabricksSession', + ) + @patch( + 'databricks.sdk.WorkspaceClient', + ) + @patch( + 'llmfoundry.command_utils.data_prep.convert_delta_to_json.os.makedirs', + ) + @patch( + 'llmfoundry.command_utils.data_prep.convert_delta_to_json.iterative_combine_jsons', + ) + @patch('llmfoundry.command_utils.data_prep.convert_delta_to_json.fetch') def 
test_sqlconnect_called_dbr13( self, mock_fetch: Any, @@ -231,34 +277,49 @@ def test_sqlconnect_called_dbr13( mock_databricks_session: Any, mock_sql_connect: Any, ): - - args = MagicMock() - - args.delta_table_name = 'test_table' - args.json_output_folder = '/path/to/jsonl' + delta_table_name = 'test_table' + json_output_folder = '/path/to/jsonl' # Execute function with http_path=None (should use dbconnect) - args.http_path = 'test_path' - args.cluster_id = '1234' - args.DATABRICKS_HOST = 'host' - args.DATABRICKS_TOKEN = 'token' - args.use_serverless = False + http_path = 'test_path' + cluster_id = '1234' + DATABRICKS_HOST = 'host' + DATABRICKS_TOKEN = 'token' + use_serverless = False mock_cluster_response = Namespace(spark_version='13.0.0-scala2.12') mock_workspace_client.return_value.clusters.get.return_value = mock_cluster_response - fetch_DT(args) + fetch_DT( + delta_table_name=delta_table_name, + json_output_folder=json_output_folder, + http_path=http_path, + cluster_id=cluster_id, + DATABRICKS_HOST=DATABRICKS_HOST, + DATABRICKS_TOKEN=DATABRICKS_TOKEN, + use_serverless=use_serverless, + ) mock_sql_connect.assert_called_once_with( - server_hostname=args.DATABRICKS_HOST, - http_path=args.http_path, - access_token=args.DATABRICKS_TOKEN, + server_hostname=DATABRICKS_HOST, + http_path=http_path, + access_token=DATABRICKS_TOKEN, ) - @patch('scripts.data_prep.convert_delta_to_json.sql.connect') - @patch('scripts.data_prep.convert_delta_to_json.DatabricksSession') - @patch('scripts.data_prep.convert_delta_to_json.WorkspaceClient') - @patch('scripts.data_prep.convert_delta_to_json.os.makedirs') - @patch('scripts.data_prep.convert_delta_to_json.iterative_combine_jsons') - @patch('scripts.data_prep.convert_delta_to_json.fetch') + @patch( + 'databricks.sql.connect', + ) + @patch( + 'databricks.connect.DatabricksSession', + ) + @patch( + 'databricks.sdk.WorkspaceClient', + ) + @patch( + 'llmfoundry.command_utils.data_prep.convert_delta_to_json.os.makedirs', + ) + @patch( + 'llmfoundry.command_utils.data_prep.convert_delta_to_json.iterative_combine_jsons', + ) + @patch('llmfoundry.command_utils.data_prep.convert_delta_to_json.fetch') def test_sqlconnect_called_dbr14( self, mock_fetch: Any, @@ -268,34 +329,49 @@ def test_sqlconnect_called_dbr14( mock_databricks_session: Any, mock_sql_connect: Any, ): - - args = MagicMock() - - args.delta_table_name = 'test_table' - args.json_output_folder = '/path/to/jsonl' + delta_table_name = 'test_table' + json_output_folder = '/path/to/jsonl' # Execute function with http_path=None (should use dbconnect) - args.http_path = 'test_path' - args.cluster_id = '1234' - args.DATABRICKS_HOST = 'host' - args.DATABRICKS_TOKEN = 'token' - args.use_serverless = False + http_path = 'test_path' + cluster_id = '1234' + DATABRICKS_HOST = 'host' + DATABRICKS_TOKEN = 'token' + use_serverless = False mock_cluster_response = Namespace(spark_version='14.2.0-scala2.12') mock_workspace_client.return_value.clusters.get.return_value = mock_cluster_response - fetch_DT(args) + fetch_DT( + delta_table_name=delta_table_name, + json_output_folder=json_output_folder, + http_path=http_path, + cluster_id=cluster_id, + DATABRICKS_HOST=DATABRICKS_HOST, + DATABRICKS_TOKEN=DATABRICKS_TOKEN, + use_serverless=use_serverless, + ) mock_sql_connect.assert_called_once_with( - server_hostname=args.DATABRICKS_HOST, - http_path=args.http_path, - access_token=args.DATABRICKS_TOKEN, + server_hostname=DATABRICKS_HOST, + http_path=http_path, + access_token=DATABRICKS_TOKEN, ) - 
@patch('scripts.data_prep.convert_delta_to_json.sql.connect') - @patch('scripts.data_prep.convert_delta_to_json.DatabricksSession') - @patch('scripts.data_prep.convert_delta_to_json.WorkspaceClient') - @patch('scripts.data_prep.convert_delta_to_json.os.makedirs') - @patch('scripts.data_prep.convert_delta_to_json.iterative_combine_jsons') - @patch('scripts.data_prep.convert_delta_to_json.fetch') + @patch( + 'databricks.sql.connect', + ) + @patch( + 'databricks.connect.DatabricksSession', + ) + @patch( + 'databricks.sdk.WorkspaceClient', + ) + @patch( + 'llmfoundry.command_utils.data_prep.convert_delta_to_json.os.makedirs', + ) + @patch( + 'llmfoundry.command_utils.data_prep.convert_delta_to_json.iterative_combine_jsons', + ) + @patch('llmfoundry.command_utils.data_prep.convert_delta_to_json.fetch') def test_sqlconnect_called_https( self, mock_fetch: Any, @@ -305,34 +381,49 @@ def test_sqlconnect_called_https( mock_databricks_session: Any, mock_sql_connect: Any, ): - - args = MagicMock() - - args.delta_table_name = 'test_table' - args.json_output_folder = '/path/to/jsonl' + delta_table_name = 'test_table' + json_output_folder = '/path/to/jsonl' # Execute function with http_path=None (should use dbconnect) - args.http_path = 'test_path' - args.cluster_id = '1234' - args.DATABRICKS_HOST = 'https://test-host' - args.DATABRICKS_TOKEN = 'token' - args.use_serverless = False + http_path = 'test_path' + cluster_id = '1234' + DATABRICKS_HOST = 'https://test-host' + DATABRICKS_TOKEN = 'token' + use_serverless = False mock_cluster_response = Namespace(spark_version='14.2.0-scala2.12') mock_workspace_client.return_value.clusters.get.return_value = mock_cluster_response - fetch_DT(args) + fetch_DT( + delta_table_name=delta_table_name, + json_output_folder=json_output_folder, + http_path=http_path, + cluster_id=cluster_id, + DATABRICKS_HOST=DATABRICKS_HOST, + DATABRICKS_TOKEN=DATABRICKS_TOKEN, + use_serverless=use_serverless, + ) mock_sql_connect.assert_called_once_with( server_hostname='test-host', - http_path=args.http_path, - access_token=args.DATABRICKS_TOKEN, + http_path=http_path, + access_token=DATABRICKS_TOKEN, ) - @patch('scripts.data_prep.convert_delta_to_json.sql.connect') - @patch('scripts.data_prep.convert_delta_to_json.DatabricksSession') - @patch('scripts.data_prep.convert_delta_to_json.WorkspaceClient') - @patch('scripts.data_prep.convert_delta_to_json.os.makedirs') - @patch('scripts.data_prep.convert_delta_to_json.iterative_combine_jsons') - @patch('scripts.data_prep.convert_delta_to_json.fetch') + @patch( + 'databricks.sql.connect', + ) + @patch( + 'databricks.connect.DatabricksSession', + ) + @patch( + 'databricks.sdk.WorkspaceClient', + ) + @patch( + 'llmfoundry.command_utils.data_prep.convert_delta_to_json.os.makedirs', + ) + @patch( + 'llmfoundry.command_utils.data_prep.convert_delta_to_json.iterative_combine_jsons', + ) + @patch('llmfoundry.command_utils.data_prep.convert_delta_to_json.fetch') def test_serverless( self, mock_fetch: Any, @@ -342,22 +433,27 @@ def test_serverless( mock_databricks_session: Any, mock_sql_connect: Any, ): - - args = MagicMock() - - args.delta_table_name = 'test_table' - args.json_output_folder = '/path/to/jsonl' + delta_table_name = 'test_table' + json_output_folder = '/path/to/jsonl' # Execute function with http_path=None (should use dbconnect) - args.http_path = 'test_path' - args.cluster_id = '1234' - args.DATABRICKS_HOST = 'https://test-host' - args.DATABRICKS_TOKEN = 'token' - args.use_serverless = True + http_path = 'test_path' + cluster_id = 
'1234' + DATABRICKS_HOST = 'https://test-host' + DATABRICKS_TOKEN = 'token' + use_serverless = True mock_cluster_response = Namespace(spark_version='14.2.0-scala2.12') mock_workspace_client.return_value.clusters.get.return_value = mock_cluster_response - fetch_DT(args) + fetch_DT( + delta_table_name=delta_table_name, + json_output_folder=json_output_folder, + http_path=http_path, + cluster_id=cluster_id, + DATABRICKS_HOST=DATABRICKS_HOST, + DATABRICKS_TOKEN=DATABRICKS_TOKEN, + use_serverless=use_serverless, + ) assert not mock_sql_connect.called assert not mock_databricks_session.builder.remote.called From 221d3e2bfa641d007b2c666dd0402d57de0593ff Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Tue, 23 Jul 2024 13:16:04 -0700 Subject: [PATCH 28/57] Revert "Use utils to get shared fs safe signal file name (#1381)" (#1389) This reverts commit d2d29adad17ae1fc48a294dfd5c4fa4d3e63e809. --- llmfoundry/data/finetuning/dataloader.py | 55 +++++++++++++++--------- llmfoundry/data/finetuning/tasks.py | 2 +- llmfoundry/models/hf/hf_causal_lm.py | 2 +- llmfoundry/utils/builders.py | 2 +- 4 files changed, 38 insertions(+), 23 deletions(-) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 60052acdc5..11104ac706 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -534,27 +534,42 @@ def _download_remote_hf_dataset(remote_path: str, split: str) -> str: # Since we don't know exactly what the extension will be, since it is one of a list # use a signal file to wait for instead of the desired file - with dist.busy_wait_for_local_rank_zero(finetune_dir): - if dist.get_local_rank() == 0: - try: - get_file(path=name, destination=destination, overwrite=True) - except FileNotFoundError as e: - if extension == SUPPORTED_EXTENSIONS[-1]: - files_searched = [ - f'{name}/{split}{ext}' - for ext in SUPPORTED_EXTENSIONS - ] - raise FileNotFoundError( - f'Could not find a file with any of ' + \ - f'the supported extensions: {SUPPORTED_EXTENSIONS}\n' + \ - f'at {files_searched}', - ) from e - else: - log.debug( - f'Could not find {name}, looking for another extension', - ) - continue + signal_file_path = os.path.join( + finetune_dir, + f'.node_{dist.get_node_rank()}_local_rank0_completed', + ) + if dist.get_local_rank() == 0: + try: + get_file(path=name, destination=destination, overwrite=True) + except FileNotFoundError as e: + if extension == SUPPORTED_EXTENSIONS[-1]: + files_searched = [ + f'{name}/{split}{ext}' for ext in SUPPORTED_EXTENSIONS + ] + raise FileNotFoundError( + f'Could not find a file with any of ' + \ + f'the supported extensions: {SUPPORTED_EXTENSIONS}\n' + \ + f'at {files_searched}', + ) from e + else: + log.debug( + f'Could not find {name}, looking for another extension', + ) + continue + + os.makedirs(os.path.dirname(signal_file_path), exist_ok=True) + with open(signal_file_path, 'wb') as f: + f.write(b'local_rank0_completed_download') + + # Avoid the collective call until the local rank zero has finished trying to download the dataset + # so that we don't timeout for large downloads. 
This syncs all processes on the node + with dist.local_rank_zero_download_and_wait(signal_file_path): + # Then, wait to ensure every node has finished trying to download the dataset + dist.barrier() + # clean up signal file + if dist.get_local_rank() == 0: + os.remove(signal_file_path) dist.barrier() break return finetune_dir diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index d5af632952..78bfb9c74c 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -827,7 +827,7 @@ def build_from_hf( Returns: Dataset: The tokenized dataset. """ - signal_file_path = dist.get_node_signal_file_name() + signal_file_path = f'.node_{dist.get_node_rank()}_local_rank0_data_prep_completed' # Non local rank 0 ranks will wait here for local rank 0 to finish the data processing. # Once local rank 0 is done, the datasets are all cached on disk, and all other ranks diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py index 7c0baf0c58..071310d69e 100644 --- a/llmfoundry/models/hf/hf_causal_lm.py +++ b/llmfoundry/models/hf/hf_causal_lm.py @@ -333,7 +333,7 @@ def _autoset_attn_implementation_monkeypatch( f'init_device="{init_device}" must be either "cpu" or "meta".', ) - signal_file_path = dist.get_node_signal_file_name() + signal_file_path = f'.node_{dist.get_node_rank()}_local_rank0_completed' if dist.get_local_rank() == 0: with open(signal_file_path, 'wb') as f: f.write(b'local_rank0_completed_download') diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index cf27e7660e..9f18c31ec6 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -497,7 +497,7 @@ def build_tokenizer( os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = '1' os.environ['TOKENIZERS_PARALLELISM'] = 'false' - signal_file_path = dist.get_node_signal_file_name() + signal_file_path = f'.node_{dist.get_node_rank()}_local_rank0_completed_tokenizer_setup' if dist.is_available() and dist.is_initialized( ) and dist.get_world_size() > 1: From 7b160fcca23ee5a5591704e22c76e4c24962e924 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Tue, 23 Jul 2024 14:11:24 -0700 Subject: [PATCH 29/57] Avoid race condition in convert text to mds script (#1390) --- llmfoundry/command_utils/data_prep/convert_text_to_mds.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py index 14afe279fd..336c82a5e7 100644 --- a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py +++ b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py @@ -394,6 +394,13 @@ def convert_text_to_mds( reprocess (bool): Whether to always reprocess the given folder of text files trust_remote_code (bool): If true, allows custom code to be executed to load the tokenizer """ + # Load the tokenizer once on the main process so that the files are cached to avoid race conditions + # in the Hugging Face load code + AutoTokenizer.from_pretrained( + tokenizer_name, + trust_remote_code=trust_remote_code, + ) + is_remote_output = is_remote_path(output_folder) log.info(f'Output is remote: {is_remote_output}') From ced63ee33198fff2610b9bb232ee0897ec0f966d Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Tue, 23 Jul 2024 14:54:11 -0700 Subject: [PATCH 30/57] Refactor loss function for ComposerMPTCausalLM (#1387) --- llmfoundry/models/mpt/modeling_mpt.py | 64 +++++++++++++++++---------- 1 file 
changed, 41 insertions(+), 23 deletions(-) diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py index 3b2744f867..40b3aaa6ee 100644 --- a/llmfoundry/models/mpt/modeling_mpt.py +++ b/llmfoundry/models/mpt/modeling_mpt.py @@ -1285,6 +1285,40 @@ def _reorder_cache( return reordered_past +def get_targets(labels: torch.Tensor) -> torch.Tensor: + targets = torch.roll(labels, shifts=-1) + targets[:, -1] = -100 + return targets + + +def compute_loss_from_logits( + outputs: CausalLMOutputWithPast, + shift_labels: bool, + labels: torch.Tensor, + loss_fn: nn.Module, + sample_weighing_factor: Optional[torch.Tensor] = None, +) -> torch.Tensor: + targets = get_targets(labels) if shift_labels else labels + + losses = loss_fn( + outputs.logits.view(-1, outputs.logits.size(-1)), + targets.view(-1), + ) + + if torch.all(targets == loss_fn.ignore_index): + loss = losses.sum() + else: + loss = losses.sum() / (targets != loss_fn.ignore_index).sum() + if sample_weighing_factor is not None: + if sample_weighing_factor.shape[0] > 1: + raise ValueError( + 'Sample weighing factor is not supported when batch["sample_weighing_factor"].shape[0] > 1.', + ) + loss = loss * sample_weighing_factor[0].item() + + return loss + + class ComposerMPTCausalLM(HuggingFaceModel): def __init__( @@ -1362,9 +1396,7 @@ def config_class(self) -> Type[MPTConfig]: return MPTConfig def get_targets(self, batch: Mapping) -> torch.Tensor: - targets = torch.roll(batch['labels'], shifts=-1) - targets[:, -1] = -100 - return targets + return get_targets(batch['labels']) def forward(self, batch: MutableMapping) -> CausalLMOutputWithPast: if self.config.ffn_config['ffn_type'] in ffns_with_megablocks: @@ -1385,27 +1417,14 @@ def forward(self, batch: MutableMapping) -> CausalLMOutputWithPast: def loss(self, outputs: CausalLMOutputWithPast, batch: Mapping) -> Union[dict, torch.Tensor]: - if self.shift_labels: - targets = self.get_targets(batch) - else: - targets = batch['labels'] - - losses = self.loss_fn( - outputs.logits.view(-1, outputs.logits.size(-1)), - targets.view(-1), + loss = compute_loss_from_logits( + outputs, + self.shift_labels, + batch['labels'], + self.loss_fn, + batch.get('sample_weighing_factor', None), ) - if torch.all(targets == self.loss_fn.ignore_index): - loss = losses.sum() - else: - loss = losses.sum() / (targets != self.loss_fn.ignore_index).sum() - if 'sample_weighing_factor' in batch: - if batch['sample_weighing_factor'].shape[0] > 1: - raise ValueError( - 'Sample weighing factor is not supported when batch["sample_weighing_factor"].shape[0] > 1.', - ) - loss = loss * batch['sample_weighing_factor'][0].item() - if self.config.ffn_config['ffn_type'] in ffns_with_megablocks: # MegaBlocks MoE load balancing loss try: # Add try/catch to avoid transformers complaining and raising errors @@ -1420,7 +1439,6 @@ def loss(self, outputs: CausalLMOutputWithPast, 'loss': loss, 'lbl': lbl, } - return loss @cached_property From b636e5f45df2dee44f7d047fff8fcf697a2fa17f Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Tue, 23 Jul 2024 19:50:54 -0700 Subject: [PATCH 31/57] Revert "Allow for multiple workers when autopacking (#1375)" (#1392) This reverts commit d812f20c5472f771f06a9dd561a31d3a88ed26bf. 
--- llmfoundry/data/packing.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index 0c5cb1418b..a6fdf34953 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -424,6 +424,8 @@ def profile_packing( dataloader_cfg = copy.deepcopy(dataloader_cfg) dataloader_cfg.update({ 'drop_last': False, + 'num_workers': 0, + 'prefetch_factor': None, 'persistent_workers': False, }) dataloader_cfg['dataset']['packing_ratio'] = 1.0 From af37ea0443ae314997cdb45053ea75d3fb886daa Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Wed, 24 Jul 2024 10:08:29 -0700 Subject: [PATCH 32/57] bump transformers to 4.43.2 (#1393) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 4563b4dfb8..6193fdc6b4 100644 --- a/setup.py +++ b/setup.py @@ -57,7 +57,7 @@ 'mosaicml[libcloud,wandb,oci,gcs,mlflow]>=0.23.4,<0.24', 'mlflow>=2.14.1,<2.15', 'accelerate>=0.25,<0.33', # for HF inference `device_map` - 'transformers>=4.43.1,<4.44', + 'transformers>=4.43.2,<4.44', 'mosaicml-streaming>=0.7.6,<0.8', 'torch>=2.3.0,<2.4', 'datasets>=2.19,<2.20', From 70586c407d6ebb39c9f8c6bfe1380b0f1de4a15e Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Wed, 24 Jul 2024 14:07:25 -0400 Subject: [PATCH 33/57] Support rope scaling (#1391) * support rope scaling * use rope scaling * update to use rope config * update config args * use allowlist for config to enforce hygeine * allow llama3 rope config * add unit test * documented allowed llama config keys * Update llmfoundry/models/mpt/modeling_mpt.py * Address comments 1 * Apply suggestions from code review Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> * Apply suggestions from code review Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> * use same codepath for all the hf rotary embeddings * fix * update * test WIP but fix get/pop * change the thing being popped * give up on testing hf --------- Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- llmfoundry/models/mpt/configuration_mpt.py | 1 + llmfoundry/models/mpt/modeling_mpt.py | 98 +++++++++++++++------- tests/models/layers/test_flash_torch.py | 6 +- tests/models/test_rope_dail_vs_hf.py | 6 +- tests/models/test_rope_scaling.py | 35 ++++++++ 5 files changed, 111 insertions(+), 35 deletions(-) create mode 100644 tests/models/test_rope_scaling.py diff --git a/llmfoundry/models/mpt/configuration_mpt.py b/llmfoundry/models/mpt/configuration_mpt.py index 3de3744745..8ac5a8ac49 100644 --- a/llmfoundry/models/mpt/configuration_mpt.py +++ b/llmfoundry/models/mpt/configuration_mpt.py @@ -306,6 +306,7 @@ def _validate_config(self) -> None: 'no_scaling', 'linear', 'dynamic', + 'llama3', ]: raise ValueError( 'If using hf implementation of rope, the type should be one of "no_scaling", "linear" or "dynamic".', diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py index 40b3aaa6ee..7dfaf8562b 100644 --- a/llmfoundry/models/mpt/modeling_mpt.py +++ b/llmfoundry/models/mpt/modeling_mpt.py @@ -49,10 +49,8 @@ BaseModelOutputWithPast, CausalLMOutputWithPast, ) -from transformers.models.llama.modeling_llama import \ - LlamaDynamicNTKScalingRotaryEmbedding as HFDynamicNTKScalingRotaryEmbedding -from transformers.models.llama.modeling_llama import \ - LlamaLinearScalingRotaryEmbedding as HFLinearScalingRotaryEmbedding +from transformers.models.llama.modeling_llama import LlamaConfig 
+from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding from transformers.models.llama.modeling_llama import \ LlamaRotaryEmbedding as HFRotaryEmbedding @@ -88,14 +86,62 @@ log = logging.getLogger(__name__) +class InvalidConfigAccessError(KeyError): + pass + + +_ALLOWED_LLAMA_CONFIG_KEYS = { + # These are the only config keys that are set and are safe to read from + 'rope_scaling', + 'rope_theta', + 'max_position_embeddings', + 'hidden_size', + 'num_attention_heads', + + # Not set but llama modeling code tries to read this attribute + 'partial_rotary_factor', + + # Benign transformers attributes needed for __init__ + '_get_generation_defaults', + 'label2id', + 'id2label', + 'torch_dtype', + 'problem_type', + '__class__', +} + + +class PartialLlamaConfig(LlamaConfig): + """Holds the rope config for Llama models and throws. + + an `InvalidConfigAccessError` if any other config elements are read. This + class is necessary because the `LlamaRotaryEmbedding` class takes a full + `LlamaConfig` now instead of the old keyword arguments. + """ + + def __getattribute__(self, key: str): + if key not in _ALLOWED_LLAMA_CONFIG_KEYS: + raise InvalidConfigAccessError(key) + + return super().__getattribute__(key) + + def __getitem__(self, key: str): + if key not in _ALLOWED_LLAMA_CONFIG_KEYS: + raise InvalidConfigAccessError(key) + + return super().__getitem__(key) + + def gen_rotary_embedding( - rope_head_dim: int, rope_impl: str, rope_theta: int, rope_dail_config: dict, rope_hf_config: dict, max_seq_len: int, + d_model: int, + n_heads: int, ): + rope_head_dim = d_model // n_heads if rope_impl == 'dail': return DAILRotaryEmbedding( dim=rope_head_dim, @@ -108,32 +154,21 @@ def gen_rotary_embedding( 'cpu', # FSDP does not materialize modules with meta buffers, hence device is set to cpu ) elif rope_impl == 'hf': + llama_rope_config = {**rope_hf_config} + llama_rope_config['rope_type'] = llama_rope_config.pop('type') + if llama_rope_config['rope_type'] == 'no_scaling': + llama_rope_config['rope_type'] = 'default' + partial_llama_config = PartialLlamaConfig( + rope_scaling=llama_rope_config, + rope_theta=rope_theta, + max_position_embeddings=max_seq_len, + hidden_size=d_model, + num_attention_heads=n_heads, + ) if rope_hf_config['type'] == 'no_scaling': - return HFRotaryEmbeddingFoundry( - rope_head_dim, - max_position_embeddings=max_seq_len, - base=rope_theta, - device= - 'cpu', # FSDP does not materialize modules with meta buffers, hence device is set to cpu - ) - elif rope_hf_config['type'] == 'linear': - return HFLinearScalingRotaryEmbedding( - rope_head_dim, - max_position_embeddings=max_seq_len, - base=rope_theta, - scaling_factor=rope_hf_config['factor'], - device= - 'cpu', # FSDP does not materialize modules with meta buffers, hence device is set to cpu - ) - elif rope_hf_config['type'] == 'dynamic': - return HFDynamicNTKScalingRotaryEmbedding( - rope_head_dim, - max_position_embeddings=max_seq_len, - base=rope_theta, - scaling_factor=rope_hf_config['factor'], - device= - 'cpu', # FSDP does not materialize modules with meta buffers, hence device is set to cpu - ) + return HFRotaryEmbeddingFoundry(config=partial_llama_config) + elif rope_hf_config['type'] in {'llama3', 'linear', 'dynamic'}: + return LlamaRotaryEmbedding(config=partial_llama_config) raise ValueError('rope_impl needs to be either dail or hf') @@ -399,12 +434,13 @@ def __init__(self, config: MPTConfig): if self.rope: self.rope_impl = config.attn_config['rope_impl'] self.rotary_embedding = gen_rotary_embedding( - 
rope_head_dim=config.d_model // config.n_heads, rope_impl=self.rope_impl, rope_theta=config.attn_config['rope_theta'], rope_dail_config=config.attn_config['rope_dail_config'], rope_hf_config=config.attn_config['rope_hf_config'], max_seq_len=self.config.max_seq_len, + d_model=config.d_model, + n_heads=config.n_heads, ) if config.init_device != 'meta': diff --git a/tests/models/layers/test_flash_torch.py b/tests/models/layers/test_flash_torch.py index 01d982052f..4bfdfb84dc 100644 --- a/tests/models/layers/test_flash_torch.py +++ b/tests/models/layers/test_flash_torch.py @@ -251,12 +251,13 @@ def gen_bias(attn_impl: str): rotary_emb_w_meta_info = None if rope: rotary_embedding = gen_rotary_embedding( - rope_head_dim=cfg.d_model // cfg.n_heads, rope_impl=pos_emb_config['rope_impl'], rope_theta=pos_emb_config['rope_theta'], rope_dail_config=pos_emb_config.get('rope_dail_config', {}), rope_hf_config=pos_emb_config.get('rope_hf_config', {}), max_seq_len=s, + d_model=cfg.d_model, + n_heads=cfg.n_heads, ).to(device) pos = torch.arange(s).unsqueeze(0).to(device=device) # adjust the position indices to account for padding tokens @@ -664,12 +665,13 @@ def gen_bias(attn_impl: str): rotary_emb_w_meta_info = None if rope: rotary_embedding = gen_rotary_embedding( - rope_head_dim=cfg['d_model'] // cfg['n_heads'], rope_impl=pos_emb_config['rope_impl'], rope_theta=pos_emb_config['rope_theta'], rope_dail_config=pos_emb_config.get('rope_dail_config', {}), rope_hf_config=pos_emb_config.get('rope_hf_config', {}), max_seq_len=s, + d_model=cfg['d_model'], + n_heads=cfg['n_heads'], ).to(device) pos = torch.arange(s).unsqueeze(0).to(device=device) # adjust the position indices to account for padding tokens diff --git a/tests/models/test_rope_dail_vs_hf.py b/tests/models/test_rope_dail_vs_hf.py index 6a41e64f48..34fb23f670 100644 --- a/tests/models/test_rope_dail_vs_hf.py +++ b/tests/models/test_rope_dail_vs_hf.py @@ -77,12 +77,13 @@ def test_rope_dail_vs_hf(attn_type: str, seq_len: int, device: str = 'cuda'): } dail_rope = gen_rotary_embedding( - rope_head_dim=cfg.d_model // cfg.n_heads, rope_impl=dail_rope_config['rope_impl'], rope_theta=dail_rope_config['rope_theta'], rope_dail_config=dail_rope_config['rope_dail_config'], rope_hf_config={}, max_seq_len=seq_len, + d_model=cfg.d_model, + n_heads=cfg.n_heads, ).to('cuda') dail_rope_w_meta_info = { 'impl': 'dail', @@ -92,12 +93,13 @@ def test_rope_dail_vs_hf(attn_type: str, seq_len: int, device: str = 'cuda'): } hf_rope = gen_rotary_embedding( - rope_head_dim=cfg.d_model // cfg.n_heads, rope_impl=hf_rope_config['rope_impl'], rope_theta=hf_rope_config['rope_theta'], rope_dail_config={}, rope_hf_config=hf_rope_config['rope_hf_config'], max_seq_len=seq_len, + d_model=cfg.d_model, + n_heads=cfg.n_heads, ).to('cuda') pos = torch.arange(seq_len).unsqueeze(0).to(device='cuda') # adjust the position indices to account for padding tokens diff --git a/tests/models/test_rope_scaling.py b/tests/models/test_rope_scaling.py new file mode 100644 index 0000000000..484ac2b23a --- /dev/null +++ b/tests/models/test_rope_scaling.py @@ -0,0 +1,35 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 +from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding + +from llmfoundry.models.mpt.modeling_mpt import gen_rotary_embedding + +rope_config = { + 'rope_theta': 500000.0, + 'rope_impl': 'hf', + 'rope_hf_config': { + 'factor': 8.0, + 'low_freq_factor': 1.0, + 'high_freq_factor': 4.0, + 'original_max_position_embeddings': 8192, + 'type': 
'llama3', + }, +} + +rope_dail_config = {} + + +def test_rope_scaling(): + d_model = 128 + n_heads = 32 + max_seq_len = 65536 + + embedding = gen_rotary_embedding( + d_model=d_model, + n_heads=n_heads, + rope_dail_config=rope_dail_config, + max_seq_len=max_seq_len, + **rope_config, + ) + + assert isinstance(embedding, LlamaRotaryEmbedding) From cfab70ede4d2f15c527ff6ca4a4960324a9d8d1a Mon Sep 17 00:00:00 2001 From: Shashank Rajput <144760128+ShashankMosaicML@users.noreply.github.com> Date: Wed, 24 Jul 2024 11:48:45 -0700 Subject: [PATCH 34/57] minor (#1394) --- llmfoundry/models/mpt/modeling_mpt.py | 12 ++++++------ tests/models/test_model.py | 9 ++++----- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py index 7dfaf8562b..0bcd587084 100644 --- a/llmfoundry/models/mpt/modeling_mpt.py +++ b/llmfoundry/models/mpt/modeling_mpt.py @@ -49,10 +49,10 @@ BaseModelOutputWithPast, CausalLMOutputWithPast, ) -from transformers.models.llama.modeling_llama import LlamaConfig -from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding -from transformers.models.llama.modeling_llama import \ - LlamaRotaryEmbedding as HFRotaryEmbedding +from transformers.models.llama.modeling_llama import ( + LlamaConfig, + LlamaRotaryEmbedding, +) from llmfoundry.layers_registry import norms, param_init_fns from llmfoundry.models.layers.attention import ( @@ -166,7 +166,7 @@ def gen_rotary_embedding( num_attention_heads=n_heads, ) if rope_hf_config['type'] == 'no_scaling': - return HFRotaryEmbeddingFoundry(config=partial_llama_config) + return LlamaRotaryEmbeddingFoundry(config=partial_llama_config) elif rope_hf_config['type'] in {'llama3', 'linear', 'dynamic'}: return LlamaRotaryEmbedding(config=partial_llama_config) raise ValueError('rope_impl needs to be either dail or hf') @@ -341,7 +341,7 @@ def apply_sequence_id( return attn_bias -class HFRotaryEmbeddingFoundry(HFRotaryEmbedding): +class LlamaRotaryEmbeddingFoundry(LlamaRotaryEmbedding): @torch.no_grad() def forward( diff --git a/tests/models/test_model.py b/tests/models/test_model.py index 45378e42bd..ed40e7a88a 100644 --- a/tests/models/test_model.py +++ b/tests/models/test_model.py @@ -35,8 +35,7 @@ ) from transformers.modeling_outputs import CausalLMOutputWithPast from transformers.models.bloom.modeling_bloom import build_alibi_tensor -from transformers.models.llama.modeling_llama import \ - LlamaRotaryEmbedding as HFRotaryEmbedding +from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding from llmfoundry import ComposerHFCausalLM from llmfoundry.layers_registry import norms @@ -48,7 +47,7 @@ ) from llmfoundry.models.layers.blocks import MPTBlock from llmfoundry.models.mpt import MPTConfig, MPTForCausalLM, MPTModel -from llmfoundry.models.mpt.modeling_mpt import HFRotaryEmbeddingFoundry +from llmfoundry.models.mpt.modeling_mpt import LlamaRotaryEmbeddingFoundry from llmfoundry.utils import build_tokenizer from llmfoundry.utils.builders import build_composer_model from llmfoundry.utils.config_utils import to_dict_container @@ -2924,7 +2923,7 @@ def test_hf_rotary_child_class_builds(): list(range(max_seq_len)), ] * bsz) - rot_emb_mp = HFRotaryEmbeddingFoundry( + rot_emb_mp = LlamaRotaryEmbeddingFoundry( rope_head_dim, max_seq_len, rope_theta, @@ -2932,7 +2931,7 @@ def test_hf_rotary_child_class_builds(): ) cos_mp, sin_mp = rot_emb_mp(value, position_ids) - rot_emb = HFRotaryEmbedding( + rot_emb = LlamaRotaryEmbedding( rope_head_dim, 
max_seq_len, rope_theta, From e88265859de335f96b034adc9abc195c3b9719d5 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Wed, 24 Jul 2024 19:27:48 -0700 Subject: [PATCH 35/57] Dtensor oom (#1395) --- llmfoundry/callbacks/hf_checkpointer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 7127d37f40..35508cc0c7 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -455,9 +455,9 @@ def tensor_hook( state_dict[fqn] = tensor else: state_dict[fqn] = None - # Convert the state dict to the requested precision - if isinstance(tensor, torch.Tensor): - state_dict[fqn] = tensor.to(dtype=self.dtype) + + if isinstance(state_dict[fqn], torch.Tensor): + state_dict[fqn] = state_dict[fqn].to(dtype=self.dtype) del tensor if dist.get_global_rank() != 0: state_dict = {} From bb385f65cc10b62ce939d882fabe2d2a60c1cc28 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Fri, 26 Jul 2024 10:01:19 -0700 Subject: [PATCH 36/57] Add a pretrained guard for the initial meta init on global rank 0 (#1397) --- llmfoundry/models/hf/hf_causal_lm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py index 071310d69e..f7f372f5fa 100644 --- a/llmfoundry/models/hf/hf_causal_lm.py +++ b/llmfoundry/models/hf/hf_causal_lm.py @@ -284,7 +284,7 @@ def _autoset_attn_implementation_monkeypatch( # the different processes. To avoid this contention, we first create the model (on meta device) on local rank # zero. This will set up the transformers model cache and avoid the future contention. if dist.get_local_rank() == 0: - if os.path.isdir(pretrained_model_name_or_path): + if pretrained and os.path.isdir(pretrained_model_name_or_path): with init_empty_weights(include_buffers=False): with warnings.catch_warnings(): warnings.simplefilter('ignore', UserWarning) From 7de4969e1f6ec0d45b39be3469acfa2e7c383a4d Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Fri, 26 Jul 2024 10:24:00 -0700 Subject: [PATCH 37/57] Update README.md (#1398) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0299e43710..d9b75b7617 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ DBRX is a state-of-the-art open source LLM trained by Databricks Mosaic team. It | DBRX Base | 32768 | https://huggingface.co/databricks/dbrx-base | | DBRX Instruct | 32768 | https://huggingface.co/databricks/dbrx-instruct | -Our model weights and code are licensed for both researchers and commercial entities. The Databricks Open Source License can be found at [LICENSE](https://github.com/databricks/dbrx/LICENSE), and our Acceptable Use Policy can be found [here](https://www.databricks.com/legal/acceptable-use-policy-open-model). +Our model weights and code are licensed for both researchers and commercial entities. The Databricks Open Source License can be found at [LICENSE](https://github.com/databricks/dbrx/blob/main/LICENSE), and our Acceptable Use Policy can be found [here](https://www.databricks.com/legal/acceptable-use-policy-open-model). For more information about the DBRX models, see https://github.com/databricks/dbrx. 
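For reference, a minimal sketch (not part of the patch series) of how the llama3 rope-scaling path introduced in PATCH 33/57 above can be exercised. It closely mirrors the new tests/models/test_rope_scaling.py added in that patch and uses only the gen_rotary_embedding signature shown there; the dimension and scaling values are illustrative assumptions rather than recommended settings, and in an MPT model the same rope_impl/rope_theta/rope_hf_config keys are read from attn_config as shown in the modeling_mpt.py hunks above.

    from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding

    from llmfoundry.models.mpt.modeling_mpt import gen_rotary_embedding

    # Illustrative sizes only; a real model supplies its own d_model, n_heads, and max_seq_len.
    embedding = gen_rotary_embedding(
        rope_impl='hf',
        rope_theta=500000.0,
        rope_dail_config={},
        rope_hf_config={
            'type': 'llama3',  # 'no_scaling', 'linear', and 'dynamic' remain valid as before
            'factor': 8.0,
            'low_freq_factor': 1.0,
            'high_freq_factor': 4.0,
            'original_max_position_embeddings': 8192,
        },
        max_seq_len=65536,
        d_model=128,
        n_heads=32,
    )

    # 'llama3', 'linear', and 'dynamic' route to the stock LlamaRotaryEmbedding, while
    # 'no_scaling' returns the LlamaRotaryEmbeddingFoundry subclass (renamed in PATCH 34/57).
    assert isinstance(embedding, LlamaRotaryEmbedding)
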
From d8c1552115bdb7189e04482c358a594d1d80d60d Mon Sep 17 00:00:00 2001 From: Abhay Gupta Date: Fri, 26 Jul 2024 12:59:02 -0700 Subject: [PATCH 38/57] Enable passing epsilon when building norm layers (#1399) * adding eps to building norms * adding norm eps to layers and configs * adding docstrings --- llmfoundry/models/layers/attention.py | 7 +++++++ llmfoundry/models/layers/blocks.py | 7 +++++++ llmfoundry/models/layers/layer_builders.py | 2 ++ llmfoundry/models/mpt/configuration_mpt.py | 3 +++ llmfoundry/models/mpt/modeling_mpt.py | 1 + 5 files changed, 20 insertions(+) diff --git a/llmfoundry/models/layers/attention.py b/llmfoundry/models/layers/attention.py index 8e740be2b3..c7fdb5b987 100644 --- a/llmfoundry/models/layers/attention.py +++ b/llmfoundry/models/layers/attention.py @@ -415,6 +415,7 @@ def __init__( softmax_scale: Optional[float] = None, attn_pdrop: float = 0.0, norm_type: str = 'low_precision_layernorm', + norm_eps: float = 1e-05, fc_type: Optional[dict[str, Any]] = None, device: Optional[str] = None, bias: bool = True, @@ -520,6 +521,7 @@ def __init__( self.q_ln = build_norm( name=norm_type.lower(), normalized_shape=norm_size, + eps=norm_eps, device=device, ) if self.reuse_kv_layer_idx is None: @@ -528,6 +530,7 @@ def __init__( self.k_ln = build_norm( name=norm_type.lower(), normalized_shape=norm_size, + eps=norm_eps, device=device, ) @@ -796,6 +799,7 @@ def __init__( softmax_scale: Optional[float] = None, attn_pdrop: float = 0.0, norm_type: str = 'low_precision_layernorm', + norm_eps: float = 1e-05, fc_type: Optional[dict[str, Any]] = None, device: Optional[str] = None, bias: bool = True, @@ -814,6 +818,7 @@ def __init__( softmax_scale=softmax_scale, attn_pdrop=attn_pdrop, norm_type=norm_type, + norm_eps=norm_eps, fc_type=fc_type, device=device, bias=bias, @@ -841,6 +846,7 @@ def __init__( softmax_scale: Optional[float] = None, attn_pdrop: float = 0.0, norm_type: str = 'low_precision_layernorm', + norm_eps: float = 1e-05, fc_type: Optional[dict[str, Any]] = None, device: Optional[str] = None, bias: bool = True, @@ -859,6 +865,7 @@ def __init__( softmax_scale=softmax_scale, attn_pdrop=attn_pdrop, norm_type=norm_type, + norm_eps=norm_eps, fc_type=fc_type, device=device, bias=bias, diff --git a/llmfoundry/models/layers/blocks.py b/llmfoundry/models/layers/blocks.py index c6988b7bd7..92735cc489 100644 --- a/llmfoundry/models/layers/blocks.py +++ b/llmfoundry/models/layers/blocks.py @@ -42,6 +42,7 @@ def __init__( ffn_config: Optional[Dict] = None, resid_pdrop: float = 0.0, norm_type: str = 'low_precision_layernorm', + norm_eps: float = 1e-05, fc_type: Optional[dict[str, Any]] = None, device: Optional[str] = None, no_bias: bool = False, @@ -84,6 +85,7 @@ def __init__( fc_type=fc_type, resid_pdrop=resid_pdrop, norm_type=norm_type, + norm_eps=norm_eps, device=device, no_bias=no_bias, ) @@ -99,6 +101,7 @@ def __init__( self.norm_1 = build_norm( name=norm_type.lower(), normalized_shape=d_model, + eps=norm_eps, device=device, ) self.attn = build_attention_layer( @@ -117,6 +120,7 @@ def __init__( self.norm_2 = build_norm( name=norm_type.lower(), normalized_shape=d_model, + eps=norm_eps, device=device, ) @@ -260,6 +264,7 @@ def __init__( fc_type: Optional[dict[str, Any]] = None, resid_pdrop: float = 0.0, norm_type: str = 'low_precision_layernorm', + norm_eps: float = 1e-05, device: Optional[str] = None, no_bias: bool = False, **kwargs: Any, @@ -283,6 +288,7 @@ def __init__( self.norm_1 = build_norm( name=norm_type.lower(), normalized_shape=d_model, + eps=norm_eps, device=device, ) 
self.attn = build_attention_layer( @@ -302,6 +308,7 @@ def __init__( self.norm_2 = build_norm( name=norm_type.lower(), normalized_shape=d_model, + eps=norm_eps, device=device, ) self.resid_attn_dropout = nn.Dropout(resid_pdrop) diff --git a/llmfoundry/models/layers/layer_builders.py b/llmfoundry/models/layers/layer_builders.py index 69d2059bad..d5fd1d37d4 100644 --- a/llmfoundry/models/layers/layer_builders.py +++ b/llmfoundry/models/layers/layer_builders.py @@ -26,10 +26,12 @@ def build_norm( name: str, normalized_shape: Union[int, List[int], torch.Size], + eps: Optional[float] = 1e-5, device: Optional[str] = None, ): kwargs = { 'normalized_shape': normalized_shape, + 'eps': eps, 'device': device, } diff --git a/llmfoundry/models/mpt/configuration_mpt.py b/llmfoundry/models/mpt/configuration_mpt.py index 8ac5a8ac49..86cc3519ba 100644 --- a/llmfoundry/models/mpt/configuration_mpt.py +++ b/llmfoundry/models/mpt/configuration_mpt.py @@ -44,6 +44,7 @@ def __init__( no_bias: bool = False, embedding_fraction: float = 1.0, norm_type: str = 'low_precision_layernorm', + norm_eps: float = 1e-05, use_cache: bool = False, init_config: Optional[Dict] = None, fc_type: Union[str, Dict] = 'torch', @@ -101,6 +102,7 @@ def __init__( no_bias (bool): Whether to use bias in all layers. embedding_fraction (float): The fraction to scale the gradients of the embedding layer by. norm_type (str): choose type of norm to use + norm_eps (float): epsilon value for norm layer use_cache (bool): Whether or not the model should return the last key/values attentions init_config (Dict): A dictionary used to configure the model initialization: init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_', @@ -168,6 +170,7 @@ def __init__( self.no_bias = no_bias self.embedding_fraction = embedding_fraction self.norm_type = norm_type + self.norm_eps = norm_eps self.use_cache = use_cache self.init_config = init_config if init_config is not None else copy.deepcopy( init_config_defaults, diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py index 0bcd587084..6f9b6bf806 100644 --- a/llmfoundry/models/mpt/modeling_mpt.py +++ b/llmfoundry/models/mpt/modeling_mpt.py @@ -426,6 +426,7 @@ def __init__(self, config: MPTConfig): self.norm_f = build_norm( name=config.norm_type.lower(), normalized_shape=config.d_model, + eps=config.norm_eps, device=config.init_device, ) From 0c93331fee80a3438a663bc205fdb8ef4fb68fcd Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Fri, 26 Jul 2024 15:47:09 -0700 Subject: [PATCH 39/57] Add pre register method for mlflow (#1396) --- llmfoundry/callbacks/hf_checkpointer.py | 13 +++++++++++++ .../inference/test_convert_composer_to_hf.py | 5 +++++ 2 files changed, 18 insertions(+) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 35508cc0c7..1797c3b5b4 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -376,6 +376,17 @@ def transform_config( copied_config.ffn_config['moe_world_size'] = 1 return copied_config + def pre_register_edit(self, local_save_path: str): + """Edit the model before registering with MLflow. + + This allows a subclass to modify the model before registering with MLflow. The base class implementation will + make no modifications. + + Args: + local_save_path (str): The path to the model to be transformed. 
+ """ + pass + def transform_model_pre_registration( self, model: PreTrainedModel, @@ -618,6 +629,8 @@ def tensor_hook( os.path.join(local_save_path, license_filename), ) + self.pre_register_edit(local_save_path,) + # Spawn a new process to register the model. process = SpawnProcess( target=_register_model_with_run_id_multiprocess, diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py index ffdb09ca98..9eb214e83d 100644 --- a/tests/a_scripts/inference/test_convert_composer_to_hf.py +++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py @@ -388,6 +388,9 @@ def test_huggingface_conversion_callback_interval( checkpointer_callback.transform_model_pre_registration = MagicMock( wraps=checkpointer_callback.transform_model_pre_registration, ) + checkpointer_callback.pre_register_edit = MagicMock( + wraps=checkpointer_callback.pre_register_edit, + ) trainer = Trainer( model=original_model, device='gpu', @@ -413,9 +416,11 @@ def test_huggingface_conversion_callback_interval( metadata={}, ) assert checkpointer_callback.transform_model_pre_registration.call_count == 1 + assert checkpointer_callback.pre_register_edit.call_count == 1 assert mlflow_logger_mock.register_model_with_run_id.call_count == 1 else: assert checkpointer_callback.transform_model_pre_registration.call_count == 0 + assert checkpointer_callback.pre_register_edit.call_count == 0 assert mlflow_logger_mock.save_model.call_count == 0 assert mlflow_logger_mock.register_model_with_run_id.call_count == 0 From 799279bc19ad1ae730b1f3ec985495e6100e066f Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Fri, 26 Jul 2024 20:43:10 -0700 Subject: [PATCH 40/57] Add pip requirements directly for mlflow save (#1400) --- llmfoundry/callbacks/hf_checkpointer.py | 6 ++++++ tests/a_scripts/inference/test_convert_composer_to_hf.py | 2 ++ 2 files changed, 8 insertions(+) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 1797c3b5b4..a186f67f14 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -613,6 +613,12 @@ def tensor_hook( ) if is_te_imported and state.precision == Precision.AMP_FP8 else contextlib.nullcontext( ) with context_manager: + # Add the pip requirements directly to avoid mlflow + # attempting to run inference on the model + model_saving_kwargs['pip_requirements'] = [ + 'transformers', + 'torch', + ] mlflow_logger.save_model(**model_saving_kwargs) # Upload the license file generated by mlflow during the model saving. 
diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py index 9eb214e83d..cd47b2df7c 100644 --- a/tests/a_scripts/inference/test_convert_composer_to_hf.py +++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py @@ -414,6 +414,7 @@ def test_huggingface_conversion_callback_interval( task='llm/v1/completions', input_example=ANY, metadata={}, + pip_requirements=ANY, ) assert checkpointer_callback.transform_model_pre_registration.call_count == 1 assert checkpointer_callback.pre_register_edit.call_count == 1 @@ -594,6 +595,7 @@ def _assert_mlflow_logger_calls( 'task': 'llm/v1/completions', 'input_example': default_input_example, 'metadata': {}, + 'pip_requirements': ANY, } mlflow_logger_mock.save_model.assert_called_with(**expectation) assert mlflow_logger_mock.register_model_with_run_id.call_count == 1 From 5e07a05dd2f727928729ad23c26ce68ec8349286 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Sat, 27 Jul 2024 00:48:09 -0700 Subject: [PATCH 41/57] Remove orig params false default (#1401) --- llmfoundry/utils/config_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 48290bd7c5..dcb97eb0de 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -531,7 +531,6 @@ def process_init_device(model_cfg: Dict[str, Any], fsdp_config: Optional[Dict]): fsdp_config['sync_module_states'] = True # Set defaults for mixed initialization - fsdp_config.setdefault('use_orig_params', False) fsdp_config.setdefault('load_monolith_rank0_only', True) # Set ffn_config.device_mesh to fsdp_config.device_mesh From 5c7e99bed1dfe28ed4fe61a30c15a2b2d4527b8c Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Sun, 28 Jul 2024 18:39:24 -0700 Subject: [PATCH 42/57] Add spin_dataloaders flag (#1405) --- llmfoundry/command_utils/train.py | 1 + llmfoundry/utils/config_utils.py | 1 + 2 files changed, 2 insertions(+) diff --git a/llmfoundry/command_utils/train.py b/llmfoundry/command_utils/train.py index 77bb9dbcfe..c925e6e586 100644 --- a/llmfoundry/command_utils/train.py +++ b/llmfoundry/command_utils/train.py @@ -544,6 +544,7 @@ def train(cfg: DictConfig) -> Trainer: dist_timeout=train_cfg.dist_timeout, profiler=profiler, compile_config=compile_config, + spin_dataloaders=train_cfg.spin_dataloaders, ) # Optionally just save an HF checkpoint diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index dcb97eb0de..84a3376718 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -167,6 +167,7 @@ class TrainConfig: # Dataloader device_train_microbatch_size: Union[str, int, float] = 'auto' global_train_batch_size: Optional[int] = None + spin_dataloaders: bool = True # Eval dataloader eval_subset_num_batches: int = -1 From 6d5d016b25cdae64e403ef4557112cb69962eeb2 Mon Sep 17 00:00:00 2001 From: Brian <23239305+b-chu@users.noreply.github.com> Date: Mon, 29 Jul 2024 11:40:53 -0400 Subject: [PATCH 43/57] Remove curriculum learning error when duration less than saved timestamp (#1406) Co-authored-by: Saaketh Narayan --- .../callbacks/curriculum_learning_callback.py | 28 ++++++------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/llmfoundry/callbacks/curriculum_learning_callback.py b/llmfoundry/callbacks/curriculum_learning_callback.py index 98a672f8db..449ab338bc 100644 --- 
a/llmfoundry/callbacks/curriculum_learning_callback.py +++ b/llmfoundry/callbacks/curriculum_learning_callback.py @@ -128,18 +128,17 @@ def after_load(self, state: State, logger: Logger): self._validate_dataloader(state.train_dataloader) # If checkpoint was saved before iteration was incremented, we need to increment it now + duration = self._schedule[self._schedule_index]['duration'] if (( - self._schedule[self._schedule_index]['duration'].unit - == TimeUnit.TOKEN and state.timestamp.token_in_iteration >= - self._schedule[self._schedule_index]['duration'].value + duration.unit == TimeUnit.TOKEN and + state.timestamp.token_in_iteration >= duration.value ) or ( - self._schedule[self._schedule_index]['duration'].unit - == TimeUnit.EPOCH and state.timestamp.epoch_in_iteration >= - self._schedule[self._schedule_index]['duration'].value + duration.unit == TimeUnit.EPOCH and + state.timestamp.epoch_in_iteration >= duration.value )): log.warning(( - 'The CurriculumLearning callback has detected that the previous run did not correctly ' - 'increment the iteration.' + 'The CurriculumLearning callback has detected that the ' + 'previous run did not correctly increment the iteration.' )) self._schedule_index += 1 state.timestamp = state.timestamp.to_next_iteration() @@ -199,24 +198,13 @@ def load_state_dict(self, state: dict[str, Any]): f'Expected {saved_loader} but got {current_loader}', )) - # Ensure that the current datamix duration is greater than timestamp + # Ensure that the current datamix duration is in the correct units duration = self._schedule[self._schedule_index]['duration'] if duration.unit != TimeUnit.TOKEN and duration.unit != TimeUnit.EPOCH: raise ValueError(( f'Duration must be in terms of tokens or epochs, but got ', f'{duration.unit}.', )) - if (( - duration.unit == TimeUnit.TOKEN and - duration > state['timestamp'].token_in_iteration - ) or ( - duration.unit == TimeUnit.EPOCH and - duration > state['timestamp'].epoch_in_iteration - )): - raise ValueError(( - 'The duration of the current datamix must be less or equal to ' - 'than the saved timestamp.' - )) def _build_train_loader( self, From 6f4aa8c98cd48511895b46527585ead3786c622f Mon Sep 17 00:00:00 2001 From: Saaketh Narayan Date: Mon, 29 Jul 2024 11:02:42 -0700 Subject: [PATCH 44/57] Set pretrained model name correctly, if provided, in HF Checkpointer (#1407) --- llmfoundry/callbacks/hf_checkpointer.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index a186f67f14..79dc73de98 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -217,6 +217,14 @@ def __init__( ) self.mlflow_logging_config = mlflow_logging_config + if 'metadata' in self.mlflow_logging_config: + self.pretrained_model_name = self.mlflow_logging_config[ + 'metadata'].get( + 'pretrained_model_name', + None, + ) + else: + self.pretrained_model_name = None self.huggingface_folder_name_fstr = os.path.join( 'huggingface', @@ -529,6 +537,16 @@ def tensor_hook( original_tokenizer, ) + # Ensure that the pretrained model name is correctly set on the saved HF checkpoint. 
+ if self.pretrained_model_name is not None: + new_model_instance.name_or_path = self.pretrained_model_name + if self.using_peft: + new_model_instance.base_model.name_or_path = self.pretrained_model_name + for k in new_model_instance.peft_config.keys(): + new_model_instance.peft_config[ + k + ].base_model_name_or_path = self.pretrained_model_name + log.debug('Saving Hugging Face checkpoint to disk') # This context manager casts the TE extra state in io.BytesIO format to tensor format # Needed for proper hf ckpt saving. @@ -624,10 +642,7 @@ def tensor_hook( # Upload the license file generated by mlflow during the model saving. license_filename = _maybe_get_license_filename( local_save_path, - self.mlflow_logging_config['metadata'].get( - 'pretrained_model_name', - None, - ), + self.pretrained_model_name, ) if license_filename is not None: mlflow_logger._mlflow_client.log_artifact( From 7a7f6df33f9e9938c4b0e82a753ae93d3b43d8c9 Mon Sep 17 00:00:00 2001 From: Abhay Gupta Date: Mon, 29 Jul 2024 16:40:09 -0700 Subject: [PATCH 45/57] Enable QuickGelu Function for CLIP models (#1408) * enabling quick_gelu fn * better docformat * test for act_fn * fix comments * changes for pre-commit --- llmfoundry/models/layers/ffn.py | 24 +++++++++-- tests/models/layers/test_ffn.py | 73 +++++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+), 4 deletions(-) create mode 100644 tests/models/layers/test_ffn.py diff --git a/llmfoundry/models/layers/ffn.py b/llmfoundry/models/layers/ffn.py index a28725ee0f..8028a65a8b 100644 --- a/llmfoundry/models/layers/ffn.py +++ b/llmfoundry/models/layers/ffn.py @@ -53,6 +53,19 @@ } +def quickgelu_activation(input: torch.Tensor) -> torch.Tensor: + """Applies GELU approximation that is fast but somewhat inaccurate. + + Args: + input (torch.Tensor): Input tensor of shape(*), where * means any + number of dimensions + + Returns: + torch.Tensor: Tensor with same shape as input tensor + """ + return input * torch.sigmoid(1.702 * input) + + def resolve_ffn_act_fn( config: Optional[dict] = None, ) -> Callable[[torch.Tensor], torch.Tensor]: @@ -70,10 +83,13 @@ def resolve_ffn_act_fn( config = _FFN_ACT_FN_DEFAULT config = deepcopy(config) name = config.pop('name') - if not hasattr(torch.nn.functional, name): - raise ValueError(f'Unrecognized activation function name ({name}).') - act = getattr(torch.nn.functional, name) - return partial(act, **config) + if name == 'quick_gelu': + return quickgelu_activation + else: + if not hasattr(torch.nn.functional, name): + raise ValueError(f'Unrecognized activation function name ({name}).') + act = getattr(torch.nn.functional, name) + return partial(act, **config) _DEFAULT_ACT_FN = resolve_ffn_act_fn(_FFN_ACT_FN_DEFAULT) diff --git a/tests/models/layers/test_ffn.py b/tests/models/layers/test_ffn.py new file mode 100644 index 0000000000..bb78763f58 --- /dev/null +++ b/tests/models/layers/test_ffn.py @@ -0,0 +1,73 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +import pytest +import torch +import torch.distributed as dist +import torch.nn as nn + +from llmfoundry.models.layers.ffn import quickgelu_activation +from llmfoundry.models.layers.layer_builders import build_ffn + + +@pytest.mark.gpu +def test_quickgelu_activation(): + d_model = 32 + expansion_ratio = 1 + no_bias = True + ffn_config = { + 'ffn_act_fn': { + 'name': 'quick_gelu', + }, + 'ffn_type': 'mptmlp', + } + rank: int = dist.get_rank() + device_str = f'cuda:{rank}' + device: torch.device = torch.device(device_str) + + ffn1 = build_ffn( 
+ name=ffn_config['ffn_type'], + d_model=d_model, + expansion_ratio=expansion_ratio, + device=device_str, + bias=not no_bias, + ffn_kwargs=ffn_config, + ) + assert ( + ffn1.act == quickgelu_activation + ), f'Expected quick_gelu activation function, got {ffn1.act}' + + ffn_config = { + 'ffn_act_fn': { + 'name': 'gelu', + }, + 'ffn_type': 'mptmlp', + } + ffn2 = build_ffn( + name=ffn_config['ffn_type'], + d_model=d_model, + expansion_ratio=expansion_ratio, + device=device_str, + bias=not no_bias, + ffn_kwargs=ffn_config, + ) + + def num_params(model: nn.Module) -> int: + model_parameters = filter(lambda p: p.requires_grad, model.parameters()) + return sum([p.numel() for p in model_parameters]) + + ffn1_numparams = num_params(ffn1) + ffn2_numparams = num_params(ffn2) + assert ( + ffn1_numparams == ffn2_numparams + ), 'Only activation paths should have changed, re-check modeling!' + + input_ = torch.rand(1, d_model, device=device) + output1 = ffn1(input_) + output2 = ffn2(input_) + assert ( + output1.numel() == output2.numel() + ), 'Only activation paths should have changed, re-check modeling!' + assert ( + not torch.allclose(output1, output2) + ), 'Functions are different, outputs should not match!' From 396d06f427f9f24b6fa61ed2b97faa6f1d705af7 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 30 Jul 2024 11:08:10 -0700 Subject: [PATCH 46/57] Update setup.py (#1411) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6193fdc6b4..494db8d62a 100644 --- a/setup.py +++ b/setup.py @@ -58,7 +58,7 @@ 'mlflow>=2.14.1,<2.15', 'accelerate>=0.25,<0.33', # for HF inference `device_map` 'transformers>=4.43.2,<4.44', - 'mosaicml-streaming>=0.7.6,<0.8', + 'mosaicml-streaming>=0.8.0,<0.9', 'torch>=2.3.0,<2.4', 'datasets>=2.19,<2.20', 'fsspec==2023.6.0', # newer version results in a bug in datasets that duplicates data From 9a62bfd668390f514ff5d1180803c0154403ec0d Mon Sep 17 00:00:00 2001 From: Kevin DeShawn <126115026+KevDevSha@users.noreply.github.com> Date: Tue, 30 Jul 2024 17:39:51 -0500 Subject: [PATCH 47/57] Kevin/ghcr build (#1413) * add ghcr * remove maximize build space * Update docker.yaml * Update docker.yaml * concat * Update docker.yaml --- .github/workflows/docker.yaml | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml index e4e6f83551..17bb976a5d 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/docker.yaml @@ -24,13 +24,6 @@ jobs: base_image: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws dep_groups: "[gpu]" steps: - - name: Maximize Build Space on Worker - uses: easimon/maximize-build-space@v4 - with: - overprovision-lvm: true - remove-dotnet: true - remove-android: true - remove-haskell: true - name: Checkout uses: actions/checkout@v3 @@ -47,6 +40,13 @@ jobs: username: ${{ secrets.DOCKER_HUB_USERNAME }} password: ${{ secrets.DOCKER_HUB_PASSWORD }} + - name: Login to GHCR + uses: docker/login-action@v2 + with: + username: ${{ secrets.GHCR_USERNAME }} + password: ${{ secrets.GHCR_TOKEN }} + registry: ghcr.io + - name: Calculate Docker Image Variables run: | set -euxo pipefail @@ -60,13 +60,17 @@ jobs: if [ "${{ github.event_name }}" == "pull_request" ]; then echo "Triggered by pull_request event." 
STAGING_REPO="mosaicml/ci-staging" - IMAGE_TAG="${STAGING_REPO}:${{matrix.name}}-${GIT_SHA}" + GHCR_STAGING_REPO="ghcr.io/databricks-mosaic/ci-staging" + GHCR_IMAGE_TAG="${GHCR_STAGING_REPO}:${{matrix.name}}-${GIT_SHA}" + IMAGE_TAG="${STAGING_REPO}:${{matrix.name}}-${GIT_SHA},${GHCR_IMAGE_TAG}" IMAGE_CACHE="${STAGING_REPO}:${{matrix.name}}-buildcache" else # Triggered by push or workflow_dispatch event echo "Triggered by ${{ github.event_name }} event, releasing to prod" PROD_REPO="mosaicml/llm-foundry" - IMAGE_TAG="${PROD_REPO}:${{matrix.name}}-${GIT_SHA},${PROD_REPO}:${{matrix.name}}-latest" + GHCR_PROD_REPO="ghcr.io/databricks-mosaic/llm-foundry" + GHCR_IMAGE_TAG="${GHCR_PROD_REPO}:${{matrix.name}}-${GIT_SHA},${GHCR_PROD_REPO}:${{matrix.name}}-latest" + IMAGE_TAG="${PROD_REPO}:${{matrix.name}}-${GIT_SHA},${PROD_REPO}:${{matrix.name}}-latest,${GHCR_IMAGE_TAG}" IMAGE_CACHE="${PROD_REPO}:${{matrix.name}}-buildcache" fi From 9417417d811cee48432be42508833b858e215295 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 30 Jul 2024 23:58:10 +0000 Subject: [PATCH 48/57] Update accelerate requirement from <0.33,>=0.25 to >=0.25,<0.34 (#1403) Updates the requirements on [accelerate](https://github.com/huggingface/accelerate) to permit the latest version. - [Release notes](https://github.com/huggingface/accelerate/releases) - [Commits](https://github.com/huggingface/accelerate/compare/v0.25.0...v0.33.0) --- updated-dependencies: - dependency-name: accelerate dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Mihir Patel --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 494db8d62a..2ad11ad994 100644 --- a/setup.py +++ b/setup.py @@ -56,7 +56,7 @@ install_requires = [ 'mosaicml[libcloud,wandb,oci,gcs,mlflow]>=0.23.4,<0.24', 'mlflow>=2.14.1,<2.15', - 'accelerate>=0.25,<0.33', # for HF inference `device_map` + 'accelerate>=0.25,<0.34', # for HF inference `device_map` 'transformers>=4.43.2,<4.44', 'mosaicml-streaming>=0.8.0,<0.9', 'torch>=2.3.0,<2.4', From 78e91ef790039c57100bc83362cf01f95f07ea7c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 31 Jul 2024 00:20:28 +0000 Subject: [PATCH 49/57] Update huggingface-hub requirement from <0.24,>=0.19.0 to >=0.19.0,<0.25 (#1379) Updates the requirements on [huggingface-hub](https://github.com/huggingface/huggingface_hub) to permit the latest version. - [Release notes](https://github.com/huggingface/huggingface_hub/releases) - [Commits](https://github.com/huggingface/huggingface_hub/compare/v0.19.0...v0.24.0) --- updated-dependencies: - dependency-name: huggingface-hub dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> Co-authored-by: Mihir Patel --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 2ad11ad994..bc84f9fac8 100644 --- a/setup.py +++ b/setup.py @@ -70,7 +70,7 @@ 'onnx==1.16.1', 'onnxruntime==1.18.1', 'boto3>=1.21.45,<2', - 'huggingface-hub>=0.19.0,<0.24', + 'huggingface-hub>=0.19.0,<0.25', 'beautifulsoup4>=4.12.2,<5', # required for model download utils 'tenacity>=8.2.3,<9', 'catalogue>=2,<3', From 461a402671a4c476d8aa84027f6a866f2052d63a Mon Sep 17 00:00:00 2001 From: Eitan Turok <150733043+eitanturok@users.noreply.github.com> Date: Wed, 31 Jul 2024 11:48:00 -0400 Subject: [PATCH 50/57] Make Pytest log in color in Github Action (#1412) Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 53007cafaf..e5c931f4c5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -79,7 +79,7 @@ reportMissingImports = "none" # Pytest [tool.pytest.ini_options] # By default, skip gpu tests -addopts = "--tb=short -m 'not gpu'" +addopts = "--tb=short -m 'not gpu' --color=yes" markers = [ # For distributed testing From c9d09cea11a68dfa1c822f3bb260440165c2a599 Mon Sep 17 00:00:00 2001 From: Eitan Turok <150733043+eitanturok@users.noreply.github.com> Date: Wed, 31 Jul 2024 16:40:47 -0400 Subject: [PATCH 51/57] Read Package Version Better (#1415) Co-authored-by: Mihir Patel --- llmfoundry/__init__.py | 4 ++-- llmfoundry/_version.py | 6 ++++++ llmfoundry/callbacks/async_eval_callback.py | 3 ++- setup.py | 18 ++++++++---------- 4 files changed, 18 insertions(+), 13 deletions(-) create mode 100644 llmfoundry/_version.py diff --git a/llmfoundry/__init__.py b/llmfoundry/__init__.py index 8dbd180c0a..b851aaa559 100644 --- a/llmfoundry/__init__.py +++ b/llmfoundry/__init__.py @@ -50,6 +50,7 @@ tokenizers, utils, ) +from llmfoundry._version import __version__ from llmfoundry.data import StreamingFinetuningDataset, StreamingTextDataset from llmfoundry.eval import InContextLearningDataset, InContextLearningMetric from llmfoundry.models.hf import ComposerHFCausalLM @@ -63,6 +64,7 @@ from llmfoundry.optim import DecoupledLionW __all__ = [ + '__version__', 'StreamingFinetuningDataset', 'StreamingTextDataset', 'InContextLearningDataset', @@ -87,5 +89,3 @@ 'tokenizers', 'utils', ] - -__version__ = '0.11.0.dev0' diff --git a/llmfoundry/_version.py b/llmfoundry/_version.py new file mode 100644 index 0000000000..4c11746b43 --- /dev/null +++ b/llmfoundry/_version.py @@ -0,0 +1,6 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +"""The LLM Foundry Version.""" + +__version__ = '0.11.0.dev' diff --git a/llmfoundry/callbacks/async_eval_callback.py b/llmfoundry/callbacks/async_eval_callback.py index 646d86c8d3..1b3c31e861 100644 --- a/llmfoundry/callbacks/async_eval_callback.py +++ b/llmfoundry/callbacks/async_eval_callback.py @@ -557,7 +557,8 @@ def launch_run(self, checkpoint: str, current_interval: Time) -> Run: installation_path = i['path'] if not found_llm_foundry: - from llmfoundry import __version__ as latest_foundry_version + from llmfoundry._version import \ + __version__ as latest_foundry_version # If github integration is not found, foundry is likely installed # through the run command. 
In this case, we'll add the integration diff --git a/setup.py b/setup.py index bc84f9fac8..185d3970f7 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ import copy import os -import re +from typing import Any, Dict, Mapping import setuptools from setuptools import setup @@ -15,17 +15,15 @@ _REPO_REAL_PATH = os.path.dirname(os.path.realpath(__file__)) _PACKAGE_REAL_PATH = os.path.join(_REPO_REAL_PATH, _PACKAGE_DIR) -# Read the repo version +# Read the llm-foundry version # We can't use `.__version__` from the library since it's not installed yet -with open(os.path.join(_PACKAGE_REAL_PATH, '__init__.py')) as f: +version_path = os.path.join(_PACKAGE_REAL_PATH, '_version.py') +with open(version_path, encoding='utf-8') as f: + version_globals: Dict[str, Any] = {} + version_locals: Mapping[str, object] = {} content = f.read() -# regex: '__version__', whitespace?, '=', whitespace, quote, version, quote -# we put parens around the version so that it becomes elem 1 of the match -expr = re.compile( - r"""^__version__\s*=\s*['"]([0-9]+\.[0-9]+\.[0-9]+(?:\.\w+)?)['"]""", - re.MULTILINE, -) -repo_version = expr.findall(content)[0] + exec(content, version_globals, version_locals) + repo_version = str(version_locals['__version__']) # Use repo README for PyPi description with open('README.md', 'r', encoding='utf-8') as fh: From f670108ae677a87312ca25505fac7d00d6df4d2b Mon Sep 17 00:00:00 2001 From: Jose Javier <26491792+josejg@users.noreply.github.com> Date: Wed, 31 Jul 2024 14:40:58 -0700 Subject: [PATCH 52/57] Log original config (#1410) Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- llmfoundry/utils/config_utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 84a3376718..6811b09e7d 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -341,8 +341,6 @@ def make_dataclass_and_log_config( transforms, ) - logged_cfg.update(unstructured_config, merge=True) - arg_config_keys = set(unstructured_config.keys()) extraneous_keys = set.difference(arg_config_keys, dataclass_fields) From ac13217cbfb0654db5cf0b467f209ebdebe52b0e Mon Sep 17 00:00:00 2001 From: Eitan Turok <150733043+eitanturok@users.noreply.github.com> Date: Thu, 1 Aug 2024 13:08:31 -0400 Subject: [PATCH 53/57] Replace pydocstyle with Ruff (#1417) --- .pre-commit-config.yaml | 11 --- .../data_prep/convert_dataset_json.py | 2 +- .../data_prep/convert_delta_to_json.py | 26 +++--- llmfoundry/data/finetuning/dataloader.py | 86 ++++++++++++++++--- llmfoundry/data/finetuning/tasks.py | 32 ++++++- llmfoundry/data/packing.py | 2 +- .../in_context_learning_evaluation.py | 27 ++++-- llmfoundry/eval/datasets/utils.py | 2 +- llmfoundry/eval/metrics/nlp.py | 2 +- llmfoundry/models/hf/hf_causal_lm.py | 1 + llmfoundry/models/layers/attention.py | 1 + llmfoundry/models/layers/ffn.py | 1 + llmfoundry/models/mpt/configuration_mpt.py | 1 + llmfoundry/tokenizers/tiktoken.py | 1 + llmfoundry/utils/builders.py | 11 ++- .../utils/checkpoint_conversion_helpers.py | 1 + llmfoundry/utils/config_utils.py | 3 + llmfoundry/utils/model_download_utils.py | 4 +- llmfoundry/utils/registry_utils.py | 2 + pyproject.toml | 40 ++++++--- 20 files changed, 187 insertions(+), 69 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index dc2e3f55cd..b45021dd8c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -77,17 +77,6 @@ repos: hooks: - id: docformatter args: [--in-place, --wrap-summaries=80, 
--wrap-descriptions=80] -- repo: https://github.com/PyCQA/pydocstyle - hooks: - - id: pydocstyle - name: pydocstyle - entry: pydocstyle - language: python - types: [python] - exclude: (.ci|.github) - additional_dependencies: - - toml - rev: 6.1.1 - repo: https://github.com/adrienverge/yamllint.git rev: v1.28.0 hooks: diff --git a/llmfoundry/command_utils/data_prep/convert_dataset_json.py b/llmfoundry/command_utils/data_prep/convert_dataset_json.py index 9f174d1aaf..35d7e637e6 100644 --- a/llmfoundry/command_utils/data_prep/convert_dataset_json.py +++ b/llmfoundry/command_utils/data_prep/convert_dataset_json.py @@ -34,7 +34,7 @@ def build_hf_dataset( """Build an IterableDataset over the HF C4 or pile source data. Args: - dataset_name (str): Dataset name + path (str): Dataset name split (str): Split name. mode (ConcatMode): NO_CONCAT, or CONCAT_TOKENS max_length (int): The length of concatenated tokens diff --git a/llmfoundry/command_utils/data_prep/convert_delta_to_json.py b/llmfoundry/command_utils/data_prep/convert_delta_to_json.py index b76e457e2c..635efd54d4 100644 --- a/llmfoundry/command_utils/data_prep/convert_delta_to_json.py +++ b/llmfoundry/command_utils/data_prep/convert_delta_to_json.py @@ -78,15 +78,16 @@ def to_cf(self: 'SparkConnectClient', return the schema and drops all other responses. Args: - plan (pb2.Plan): The plan object to be executed by spark. - type (str): The output format of the result, supported formats are 'json', 'csv', and 'arrow'. + self (SparkConnectClient): The SparkConnectClient we are processing. + plan (pb2.Plan): The plan object to be executed by spark. + type (str): The output format of the result, supported formats are 'json', 'csv', and 'arrow'. Returns: - Tuple[List[Result], int, bool]: A tuple containing: - - A list of Result namedtuples, each containing a URL, row count, compressed size, - and uncompressed size of the part of the result. - - Total row count of all parts of the result. - - A boolean indicating whether the result has been truncated. + Tuple[List[Result], int, bool]: A tuple containing: + - A list of Result namedtuples, each containing a URL, row count, compressed size, + and uncompressed size of the part of the result. + - Total row count of all parts of the result. + - A boolean indicating whether the result has been truncated. """ req = self._execute_plan_request_with_metadata() req.plan.CopyFrom(plan) @@ -120,8 +121,9 @@ def to_cf(self: 'SparkConnectClient', ) # Create the iterator - from pyspark.sql.connect.client.reattach import \ - ExecutePlanResponseReattachableIterator + from pyspark.sql.connect.client.reattach import ( + ExecutePlanResponseReattachableIterator, + ) iterator = ExecutePlanResponseReattachableIterator( req, self._stub, @@ -169,6 +171,7 @@ def collect_as_cf(self: 'DataFrame', uses the `to_cf` method to execute the plan and fetch results as presigned URLs. Args: + self (pd.DataFrame): The dataframe we are processing. type (str): The output format of the result, supported formats are 'json', 'csv', and 'arrow'. 
Returns: @@ -693,8 +696,9 @@ def _check_imports(): import pyspark.sql.connect.proto.cloud_pb2 as cloud_pb2 from pyspark.sql import SparkSession from pyspark.sql.connect.client.core import SparkConnectClient - from pyspark.sql.connect.client.reattach import \ - ExecutePlanResponseReattachableIterator + from pyspark.sql.connect.client.reattach import ( + ExecutePlanResponseReattachableIterator, + ) from pyspark.sql.connect.dataframe import DataFrame from pyspark.sql.dataframe import DataFrame as SparkDataFrame from pyspark.sql.types import Row diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 11104ac706..d9450bc657 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -64,9 +64,12 @@ def build_finetuning_dataloader( on which you intend to use, as explained below. Args: - name (str): The type of dataloader to build. Must = "finetuning". - --- - *** HuggingFace dataset config fields *** + tokenizer (transformers.PreTrainedTokenizer): The tokenizer used to + prepare the data from raw text. Any missing sentinel tokens will + be added by the collator. + device_batch_size (int, float): The size of the batches (number of examples) + that the dataloader will produce. + dataset (Dict[str, Any]): A HuggingFace dataset config which contains the following fields: dataset.hf_name (str, optional): The name of the HuggingFace dataset to use. Can also be a remote http(s) directory or object store bucket containing the file {split}.jsonl in the format (prompt, response), @@ -130,16 +133,32 @@ def build_finetuning_dataloader( The script `scripts/misc/profile_packing.py` can help you choose the best packing_ratio. dataset.shuffle (bool): Whether to shuffle the dataset. - ___ See :class:`StreamingFinetuningDataset` for info on other standard config options within `dataset` that will be passed as kwargs if using the streaming codepath. - --- - tokenizer (transformers.PreTrainedTokenizer): The tokenizer used to - prepare the data from raw text. Any missing sentinel tokens will - be added by the collator. - device_batch_size (int, float): The size of the batches (number of examples) - that the dataloader will produce. + num_workers (int, optional): How many subprocesses to use for data loading. + 0 means that the data will be loaded in the main process. The default is 0. + This argument is passed directly to the pytorch :class:`DataLoader`. + drop_last (bool, optional): If true, drop the last incomplete batch, if the dataset + size is not divisible by the batch size. If False and the size of dataset is + not divisible by the batch size, then the last batch will be smaller. The + default is False. This argument is passed directly to the pytorch :class:`DataLoader`. + pin_memory (bool, optional): If True, the data loader will copy Tensors into device/CUDA + pinned memory before returning them. If your data elements are a custom type, or your + `collate_fn` returns a batch that is a custom type. This argument is passed directly to + the pytorch :class:`DataLoader`. + prefetch_factor (int, optional): Number of batches loaded in advance by each worker. + 2 means there will be a total of 2 * num_workers batches prefetched across all workers. + (default value depends on the set value for num_workers. If value of num_workers=0 default + is None. Otherwise, if value of num_workers > 0 default is 2). This argument is passed + directly to the pytorch :class:`DataLoader`. 
+ persistent_workers (bool, optional): If True, the data loader will not shut down the worker + processes after a dataset has been consumed once. This allows to maintain the workers + Dataset instances alive. The default is False. This argument is passed directly to the + pytorch :class:`DataLoader`. + timeout (int, optional): If positive, the timeout value for collecting a batch from workers. + Should always be non-negative. The default is 0. This argument is passed directly to the + pytorch :class:`DataLoader`. See :class:`DataLoader` for standard argument options to the pytorch dataloader, such as `drop_last`, `num_workers`, etc. @@ -357,7 +376,50 @@ def _validate_config( the other. Args: - dataset_cfg (DictConfig): The dataset configuration to be validated. + max_seq_len (int): The maximum length of sequences + in the batch. See :class:`Seq2SeqFinetuningCollator` docstring + for details. + decoder_only_format (bool): Whether to format the + examples for a decoder-only model. See :class:`Seq2SeqFinetuningCollator` + docstring for details. + hf_name (str, optional): The name of the HuggingFace dataset + to use. Can also be a remote http(s) directory or object store bucket + containing the file {split}.jsonl in the format (prompt, response), + in which case the builder will create a HuggingFace dataset. + local (str, optional): Local path where remote data + will be streamed to. Only valid if `cfg.dataset.remote` has + also been set. + remote (str, optional): Location of a MDS-formatted + streaming dataset to use. Setting this will tell the builder + to create a streaming dataset rather than a HuggingFace dataset. + hf_kwargs (DictConfig, optional): Additional kwargs to + pass to `datasets.load_dataset`, which can be used to load + a dataset from local files. + preprocessing_fn (str, optional): The name/import path of + the preprocessing function to use for formatting the data examples. + If ``None`` (default), the builder will use the preprocessing function + registered under `hf_name` (see `tasks.py`), if one exists, + otherwise it will skip preprocessing. + If `preprocessing_fn` corresponds to a registered preprocessing + function in `tasks.py`, the builder will use that. + Otherwise, it will interpret `preprocessing_fn` as a + "import.path:function_name" import path; e.g., it will call + `from import.path import function_name` and use the imported + function as the preprocessing function. + safe_load (bool, optional): Whether to enforce safe loading of the dataset. + If `None`, will default to not applying any safe loading. + streams (Dict[str, Any], optional): A dictionary with multiple data streams. + If `None`, will assume no streams. + target_prompts (str): Which prompts are used as training targets. + Defaults to "none", meaning prompts are never used as training targets. + See :class:`Seq2SeqFinetuningCollator` docstring for details. + target_responses (str): Which responses are used as training targets. + Defaults to "last", meaning only the final response in multi-turn examples + will serve as training targets. See :class:`Seq2SeqFinetuningCollator` docstring for + details. + kwargs (DictConfig, optional): Additional kwargs to + pass to `datasets.load_dataset`, which can be used to load + a dataset from local files. Raises: ValueError: If the dataset configuration does not meet the requirements. @@ -504,7 +566,7 @@ def _download_remote_hf_dataset(remote_path: str, split: str) -> str: completed, the function removes the signal file. 
Args: - hf_name (str): The path of the HuggingFace dataset to download. + remote_path (str): The path of the HuggingFace dataset to download. split (str): The dataset split to download (e.g., 'train', 'validation', 'test'). Returns: diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index 78bfb9c74c..397b619e73 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -162,7 +162,7 @@ def _is_empty_or_nonexistent(dirpath: str) -> bool: Args: dirpath (str): Directory path to check. - Returns + Returns: True if directory is empty or non-existent. False otherwise. """ return not os.path.isdir(dirpath) or len(os.listdir(dirpath)) == 0 @@ -820,9 +820,33 @@ def build_from_hf( Note: This function will drop examples where the prompt is longer than the max_seq_len Args: - cfg (DictConfig): The dataset configuration. - max_seq_len (int): The maximum sequence length. Examples with prompts longer than this will be dropped. - tokenizer (Tokenizer): The tokenizer to be used for tokenizing the dataset. + dataset_name (str): The name of the HuggingFace dataset + to use. Can also be a remote http(s) directory or object store bucket + containing the file {split}.jsonl in the format (prompt, response), + in which case the builder will create a HuggingFace dataset. + split (str): The split of the HuggingFace dataset. + safe_load (bool, optional): Whether to enforce safe loading of the dataset. + If `None`, will default to not applying any safe loading. + max_seq_len (int): The maximum length of sequences + in the batch. See :class:`Seq2SeqFinetuningCollator` docstring + for details. + preprocessing_fn (Callable, optional): The preprocessing function to use for + formatting the data examples. + tokenizer (PreTrainedTokenizerBase): The tokenizer to be used for tokenizing + the HuggingFace dataset. + target_prompts (str): Which prompts are used as training targets. + Defaults to "none", meaning prompts are never used as training targets. + See :class:`Seq2SeqFinetuningCollator` docstring for details. + target_responses (str): Which responses are used as training targets. + Defaults to "last", meaning only the final response in multi-turn examples + will serve as training targets. See :class:`Seq2SeqFinetuningCollator` docstring for + details. + decoder_only_format (bool): Whether to format the + examples for a decoder-only model. See :class:`Seq2SeqFinetuningCollator` + docstring for details. + hf_kwargs (DictConfig, optional): Additional kwargs to + pass to `datasets.load_dataset`, which can be used to load + a dataset from local files. Returns: Dataset: The tokenized dataset. diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index a6fdf34953..5579066f89 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -337,7 +337,7 @@ def auto_packing_ratio( dataloader_cfg (DictConfig): The dataloader configuration for profiling. tokenizer (PreTrainedTokenizerBase): The tokenizer for profiling. device_batch_size (int): The size of the batches (number of examples) per device. - num_packing_ratio (int): The number of packing ratios to try. + num_packing_ratios (int): The number of packing ratios to try. Returns: A packing ratio that minimizes padding while maintaining zero waste. 
diff --git a/llmfoundry/eval/datasets/in_context_learning_evaluation.py b/llmfoundry/eval/datasets/in_context_learning_evaluation.py index 8a8b9de551..4e49be3fba 100644 --- a/llmfoundry/eval/datasets/in_context_learning_evaluation.py +++ b/llmfoundry/eval/datasets/in_context_learning_evaluation.py @@ -251,8 +251,9 @@ def read_dataset( """ from datasets import \ Dataset as HFDataset # pyright: ignore[reportGeneralTypeIssues] - from datasets import \ - load_dataset # pyright: ignore[reportGeneralTypeIssues] + from datasets import ( # pyright: ignore[reportGeneralTypeIssues] + load_dataset, + ) if 'hf://' in dataset_uri: dataset_uri = dataset_uri.replace('hf://', '') if hf_loading_vars is None: @@ -363,6 +364,7 @@ def get_answer_from_example( Args: example (Dict): The example from which to retrieve the answer + in_context (bool): Whether this is an in-context example. Default to False. Returns: str: The answer in the example @@ -712,6 +714,7 @@ def get_answer_from_example( Args: example (Dict): The example from which to retrieve the answer + in_context (bool): Whether this is an in-context example. Default to False. Returns: str: The answer in from the example with chain of thought and delimiter if needed @@ -731,7 +734,7 @@ def tokenize_example( Args: prompt_and_fewshot (str): The collection of the prompt and fewshot examples that belongs before the example's context - ctx (str): The specific example's derived context + ctxt (str): The specific example's derived context example (Dict): The example as a dictionary. Returns: @@ -1035,6 +1038,7 @@ def get_answer_from_example( Args: example (Dict): The example from which to retrieve the answer + in_context (bool): Whether this is an in-context example. Default to False. Returns: str: The full string of the correct answer based on the 'gold' key @@ -1053,7 +1057,7 @@ def tokenize_example( Args: prompt_and_fewshot (str): The collection of the prompt and fewshot examples that belongs before the example's context - ctx (str): The specific example's derived context + ctxt (str): The specific example's derived context example (Dict): The example as a dictionary. Returns: @@ -1129,6 +1133,7 @@ def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: since the batch may consist of multiple questions, the choice_groupings indicates which contiguous sequences of elements in the batch correspond to which question gold_indices indicates which of the [0, N-1] choices is the correct one for each question. + Args: data (List): List of tokenized datapoints (dicts returned by self._tokenize_example) @@ -1168,6 +1173,7 @@ def split_batch(self, batch: Any, and real example, which refers to one possible continuation. As example count and microbatch_size are tracked in logical example, we split logical attributes by microbatch_size and real attributes by microbatch_size * num_choices. + Args: batch (Dict): Batch of data microbatch_size (int | float): Size of microbatches @@ -1419,7 +1425,7 @@ def tokenize_example( Args: prompt_and_fewshot (str): The collection of the prompt and fewshot examples that belongs before the example's context - ctx (str): The specific example's derived context + context_options (str): A list of contexts for this specific example. example (Dict): The example as a dictionary. Returns: @@ -1548,6 +1554,10 @@ def partition_dataset_by_category( Args: dataset_uri (str): Location of dataset. destination_path (str): Base destination path, we will write a separate partition off this URI for each category. 
+ hf_loading_vars (Dict): A dictionary containing keyword arguments to be passed into `load_dataset` if dataset is being pulled from HF. + hf_parsing_map (Dict): A dictionary containing a mapping from HF columns to ICL dataset keys. The dictionary should be formatted {icl_key:[hf_key1, hf_key1]}. + Column contents will be concatenated with ' ' separating them. If not included, will load the columns already present in the HF dataset. + Raises: MissingConditionalImportError: If datasets not installed raise exception. @@ -1643,8 +1653,7 @@ def get_icl_task_dataloader( # At this point, hf_model is randomly initialized composer_model = HuggingFaceModel(hf_model, hf_tokenizer) - Example: - + Example: .. testcode:: @@ -1685,8 +1694,8 @@ def get_icl_task_dataloader( hf_loading_vars (Dict, default = None): A dictionary containing keyword arguments to be passed into `load_dataset` if dataset is being pulled from HF. hf_parsing_map (Dict, default = None): A dictionary containing a mapping from HF columns to ICL dataset keys. The dictionary should be formatted {icl_key:[hf_key1, hf_key1]}. Column contents will be concatenated with ' ' separating them. If not included, will load the columns already present in the HF dataset. - kwargs (Dict[str, Any], default=None): Dictionary containing a mapping - from ICL dataset constructor's parameter names and their desired values. + destination_path: Where the dataloader will be saved. + kwargs (Dict[str, Any], default=None): Dictionary containing a mapping from ICL dataset constructor's parameter names and their desired values. Returns: DataLoader: A dataloader used for performing in-context learning evaluation on the dataset provided. diff --git a/llmfoundry/eval/datasets/utils.py b/llmfoundry/eval/datasets/utils.py index 1ce249437d..c19ae15dd9 100644 --- a/llmfoundry/eval/datasets/utils.py +++ b/llmfoundry/eval/datasets/utils.py @@ -130,7 +130,7 @@ def make_padded_input( Args: context_enc (List): The encoded input to the model continuation_enc (List): The encoded desired output for the example - max_seq_list (int): Maximum length sequences can be + max_seq_len (int): Maximum length sequences can be pad_tok_id (int): The token id we pad with padding_side (str): Which side to pad the context on. Can be 'right' or 'left diff --git a/llmfoundry/eval/metrics/nlp.py b/llmfoundry/eval/metrics/nlp.py index 3ee30ebf5e..f0fbba3ece 100644 --- a/llmfoundry/eval/metrics/nlp.py +++ b/llmfoundry/eval/metrics/nlp.py @@ -80,7 +80,7 @@ def update( Args: batch (dict): Batch must consist minimally of `input_ids` as well as any other structure needed to compute the metric. - output_logits (torch.Tensor): The model outputs evaluated on the batch `input_ids` + outputs (torch.Tensor): The model outputs evaluated on the batch `input_ids`. labels (torch.Tensor): The correct outputs. Raises: diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py index f7f372f5fa..34ce22d694 100644 --- a/llmfoundry/models/hf/hf_causal_lm.py +++ b/llmfoundry/models/hf/hf_causal_lm.py @@ -205,6 +205,7 @@ def build_inner_model( use_auth_token (bool): Whether to use an authentication token. config_overrides (Dict[str, Any]): The configuration overrides. load_in_8bit (bool): Whether to load in 8-bit. + pretrained (bool): Whether the model is pretrained. prepare_for_fsdp (bool, optional): Whether to prepare the model for FSDP wrapping. Default: False. 
Returns: diff --git a/llmfoundry/models/layers/attention.py b/llmfoundry/models/layers/attention.py index c7fdb5b987..3e365edc47 100644 --- a/llmfoundry/models/layers/attention.py +++ b/llmfoundry/models/layers/attention.py @@ -606,6 +606,7 @@ def get_qkv( Args: x (torch.Tensor): The input tensor. + prev_layer_key_value (Optional[Tuple[torch.Tensor, torch.Tensor]]): The key value of the previous layer. Returns: query (torch.Tensor): The query tensor. diff --git a/llmfoundry/models/layers/ffn.py b/llmfoundry/models/layers/ffn.py index 8028a65a8b..f5d6d67040 100644 --- a/llmfoundry/models/layers/ffn.py +++ b/llmfoundry/models/layers/ffn.py @@ -429,6 +429,7 @@ def set_ffn_device_mesh( ffn (nn.Module): The FFN module. moe_world_size (int): The MoE world size. device_mesh (DeviceMesh): The full device mesh. + get_fsdp_submesh (Callable[[DeviceMesh], DeviceMesh]): A function to get the fsdp submesh. Raises: RuntimeError: If the device mesh is 3D. diff --git a/llmfoundry/models/mpt/configuration_mpt.py b/llmfoundry/models/mpt/configuration_mpt.py index 86cc3519ba..9671eb6ed5 100644 --- a/llmfoundry/models/mpt/configuration_mpt.py +++ b/llmfoundry/models/mpt/configuration_mpt.py @@ -147,6 +147,7 @@ def __init__( reuse_kv_layer: attn_config: reuse_kv_layer_idx: -6 # Relative index of the layer whose kv cache to reuse + kwargs (Any): Other relevant keyword arguments. """ self.d_model = d_model self.n_heads = n_heads diff --git a/llmfoundry/tokenizers/tiktoken.py b/llmfoundry/tokenizers/tiktoken.py index f087664344..fd0fc5948a 100644 --- a/llmfoundry/tokenizers/tiktoken.py +++ b/llmfoundry/tokenizers/tiktoken.py @@ -90,6 +90,7 @@ def __init__( errors (str, optional): Paradigm to follow when decoding bytes to UTF-8. See [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. Defaults to `"replace"`. + kwargs (Any): Other relevant keyword arguments. """ try: import tiktoken diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 9f18c31ec6..000155f1a4 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -35,8 +35,9 @@ from llmfoundry import registry from llmfoundry.callbacks import EvalGauntlet from llmfoundry.data.dataloader import build_dataloader -from llmfoundry.eval.datasets.in_context_learning_evaluation import \ - get_icl_task_dataloader +from llmfoundry.eval.datasets.in_context_learning_evaluation import ( + get_icl_task_dataloader, +) from llmfoundry.utils.config_utils import to_dict_container, to_list_container from llmfoundry.utils.registry_utils import construct_from_registry @@ -191,7 +192,8 @@ def build_load_planner(name: str, **kwargs: Any) -> LoadPlanner: """Builds a load planner from the registry. Args: - name: Name of the load planner to build. + name (str): Name of the load planner to build. + kwargs (Any): Other relevant keyword arguments. Returns: LoadPlanner: The load planner. @@ -210,7 +212,8 @@ def build_save_planner(name: str, **kwargs: Any) -> SavePlanner: """Builds a save planner from the registry. Args: - name: Name of the save planner to build. + name (str): Name of the save planner to build. + kwargs (Any): Other relevant keyword arguments. Returns: savePlanner: The save planner. 
diff --git a/llmfoundry/utils/checkpoint_conversion_helpers.py b/llmfoundry/utils/checkpoint_conversion_helpers.py index 905afd6edb..5c65a7475e 100644 --- a/llmfoundry/utils/checkpoint_conversion_helpers.py +++ b/llmfoundry/utils/checkpoint_conversion_helpers.py @@ -177,6 +177,7 @@ def _convert_weight_to_ft_each( tensor_name (str): Name of the weight tensor. Used in naming the output file. config (Dict[str, Any]): Configuration for the model. This is used in getting model specific parameters. data (np.ndarray): Tensor data in np.ndarray format. + np_weight_data_type (np.dtype): Data type of the numpy array `data`. Returns: None: Writes to a file in `save_dir`. File name is based on the `tensor_name` diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 6811b09e7d..f10fe32735 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -466,6 +466,9 @@ def update_config_with_batch_size_info( Args: cfg (Dict[str, Any]): The config to update. + device_train_batch_size (Union[int, float]): The batch size of the training dataset for each device. + device_train_microbatch_size (Union[int, float, Literal['auto']]): The microbatch size of the training dataset for each device. + device_train_grad_accum (Union[int, Literal['auto']]): The gradient accumulation settings for each device. Returns: Dict[str, Any]: The updated config. diff --git a/llmfoundry/utils/model_download_utils.py b/llmfoundry/utils/model_download_utils.py index dde8240d8b..9609982fda 100644 --- a/llmfoundry/utils/model_download_utils.py +++ b/llmfoundry/utils/model_download_utils.py @@ -69,7 +69,7 @@ def download_from_hf_hub( Safetensors weights will be downloaded unless `prefer_safetensors` is set to False. Args: - repo_id (str): The Hugging Face Hub repo ID. + model (str): The Hugging Face Hub repo ID. save_dir (str, optional): The local path to the directory where the model files will be downloaded. prefer_safetensors (bool): Whether to prefer Safetensors weights over PyTorch weights if both are available. Defaults to True. @@ -157,7 +157,7 @@ def _recursive_download( Args: session: A requests.Session through which to make requests to the remote server. - url (str): The base URL where the files are located. + base_url (str): The base URL where the files are located. path (str): The path from the base URL to the files to download. The full URL for the download is equal to '/'. save_dir (str): The directory to save downloaded files to. diff --git a/llmfoundry/utils/registry_utils.py b/llmfoundry/utils/registry_utils.py index 3ea7cc58a7..f96e72b3a2 100644 --- a/llmfoundry/utils/registry_utils.py +++ b/llmfoundry/utils/registry_utils.py @@ -127,6 +127,7 @@ def construct_from_registry( before constructing the item to return. This should throw an exception if validation fails. Defaults to None. post_validation_function (Optional[Callable[[Any], None]], optional): An optional validation function called after constructing the item to return. This should throw an exception if validation fails. Defaults to None. + kwargs (Optional[Dict[str, Any]]): Other relevant keyword arguments. Raises: ValueError: If the validation functions failed or the registered item is invalid @@ -176,6 +177,7 @@ def import_file(loc: Union[str, Path]) -> ModuleType: """Import module from a file. Used to run arbitrary python code. + Args: name (str): Name of module to load. loc (str / Path): Path to the file. 
diff --git a/pyproject.toml b/pyproject.toml index e5c931f4c5..fdbabfff96 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,23 +11,44 @@ skip = [ "env", "wandb", "runs", "build", "node_modules" ] include_trailing_comma = true split_on_trailing_comma = true +# Ruff global +[tool.ruff] +exclude = [ + "build/**", + "docs/**", + "node_modules/**", +] + +# Ruff linter [tool.ruff.lint] select = [ "C4", - # TODO port pydocstyle - # "D", # pydocstyle "LOG", "PERF", "PLE", "COM812", + "D", # pydocstyle ] -[tool.ruff] -exclude = [ - "build/**", - "docs/**", - "node_modules/**", + +extend-select = ["D404"] # pydocstyle + +ignore = [ + "D100", + "D101", + "D102", + "D103", + "D104", + "D105", + "D107", + "D400", + "D401", + "D415", ] +[tool.ruff.lint.pydocstyle] +convention = "google" + + # Coverage [tool.coverage.run] parallel = true @@ -506,8 +527,3 @@ ignore_patterns = [ "wandb/**/*.py", "build/**/*.py", ] - -[tool.pydocstyle] -convention="google" -add_ignore="D100,D101,D102,D103,D104,D105,D107,D400,D401,D415" -add_select="D404" From cae89a2e54371ce884248a8f735d56f036823dc9 Mon Sep 17 00:00:00 2001 From: Kevin DeShawn <126115026+KevDevSha@users.noreply.github.com> Date: Thu, 1 Aug 2024 13:03:49 -0500 Subject: [PATCH 54/57] test cpu (#1416) * test cpu * Update pr-cpu.yaml * Update pr-cpu.yaml * update gpu tests * Update pr-gpu.yaml --- .github/workflows/pr-cpu.yaml | 25 ++++---- .github/workflows/pr-gpu.yaml | 108 +++++++++++++++++++--------------- 2 files changed, 74 insertions(+), 59 deletions(-) diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index 2dd1c0edab..2c85719756 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -15,23 +15,28 @@ concurrency: cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} jobs: pytest-cpu: - uses: mosaicml/ci-testing/.github/workflows/pytest-cpu.yaml@v0.0.9 + name: ${{ matrix.name }} + runs-on: ubuntu-latest strategy: matrix: include: - name: "cpu-2.3.1" + pip_deps: "[all-cpu]" container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04 markers: "not gpu" pytest_command: "coverage run -m pytest" - name: ${{ matrix.name }} - if: github.repository_owner == 'mosaicml' - with: - container: ${{ matrix.container }} - name: ${{ matrix.name }} - pip_deps: "[all-cpu]" - pytest-command: ${{ matrix.pytest_command }} - pytest-markers: ${{ matrix.markers }} - safe_directory: llm-foundry + steps: + - name: Checkout code + uses: actions/checkout@v2 + - name: Run PR CPU Tests + uses: mosaicml/ci-testing/.github/actions/pytest-cpu@v0.1.0 + with: + name: ${{ matrix.name }} + container: ${{ matrix.container }} + pip_deps: ${{ matrix.pip_deps }} + pytest_command: ${{ matrix.pytest_command }} + pytest_markers: ${{ matrix.markers }} + safe_directory: llm-foundry coverage: uses: ./.github/workflows/coverage.yaml name: Coverage Results diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml index c5638e403d..04f30b5f9c 100644 --- a/.github/workflows/pr-gpu.yaml +++ b/.github/workflows/pr-gpu.yaml @@ -4,17 +4,19 @@ on: branches: - main - release/* - pull_request_target: + pull_request: branches: - main - release/** workflow_dispatch: +# Cancel old runs when a new commit is pushed to the same branch if not on main or dev concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} jobs: pytest-gpu-1: - uses: mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.9 + name: ${{ matrix.name }} + runs-on: 
linux-ubuntu-latest strategy: fail-fast: false matrix: @@ -22,24 +24,27 @@ jobs: - name: "gpu-2.3.1-1" container: mosaicml/llm-foundry:2.3.1_cu121-latest markers: "gpu" - pytest_command: "coverage run -m pytest" pip_deps: "[all]" - name: ${{ matrix.name }} - if: github.repository_owner == 'mosaicml' - with: - container: ${{ matrix.container }} - git_repo: mosaicml/llm-foundry - mcloud-timeout: 1800 - name: ${{ matrix.name }} - pip_deps: ${{ matrix.pip_deps }} - pytest-command: ${{ matrix.pytest_command }} - pytest-markers: ${{ matrix.markers }} - python-version: 3.9 - gpu_num: 1 - secrets: - mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }} + pytest_command: "coverage run -m pytest" + ci_repo_gpu_test_ref: v0.1.0 + steps: + - name: Run PR GPU Tests + uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.0 + with: + container: ${{ matrix.container }} + git_repo: mosaicml/llm-foundry + mcloud_timeout: 1800 + name: ${{ matrix.name }} + pip_deps: ${{ matrix.pip_deps }} + pytest_command: ${{ matrix.pytest_command }} + pytest_markers: ${{ matrix.markers }} + python_version: 3.9 + gpu_num: 1 + mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }} + ci_repo_gpu_test_ref: ${{ matrix.ci_repo_gpu_test_ref }} pytest-gpu-2: - uses: mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.9 + name: ${{ matrix.name }} + runs-on: linux-ubuntu-latest strategy: fail-fast: false matrix: @@ -47,24 +52,27 @@ jobs: - name: "gpu-2.3.1-2" container: mosaicml/llm-foundry:2.3.1_cu121-latest markers: "gpu" - pytest_command: "coverage run -m pytest" pip_deps: "[all]" - name: ${{ matrix.name }} - if: github.repository_owner == 'mosaicml' - with: - container: ${{ matrix.container }} - git_repo: mosaicml/llm-foundry - mcloud-timeout: 1800 - name: ${{ matrix.name }} - pip_deps: ${{ matrix.pip_deps }} - pytest-command: ${{ matrix.pytest_command }} - pytest-markers: ${{ matrix.markers }} - python-version: 3.9 - gpu_num: 2 - secrets: - mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }} + pytest_command: "coverage run -m pytest" + ci_repo_gpu_test_ref: v0.1.0 + steps: + - name: Run PR GPU Tests + uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.0 + with: + container: ${{ matrix.container }} + git_repo: mosaicml/llm-foundry + mcloud_timeout: 1800 + name: ${{ matrix.name }} + pip_deps: ${{ matrix.pip_deps }} + pytest_command: ${{ matrix.pytest_command }} + pytest_markers: ${{ matrix.markers }} + python_version: 3.9 + gpu_num: 2 + mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }} + ci_repo_gpu_test_ref: ${{ matrix.ci_repo_gpu_test_ref }} pytest-gpu-4: - uses: mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.9 + name: ${{ matrix.name }} + runs-on: linux-ubuntu-latest strategy: fail-fast: false matrix: @@ -72,19 +80,21 @@ jobs: - name: "gpu-2.3.1-4" container: mosaicml/llm-foundry:2.3.1_cu121-latest markers: "gpu" - pytest_command: "coverage run -m pytest" pip_deps: "[all]" - name: ${{ matrix.name }} - if: github.repository_owner == 'mosaicml' - with: - container: ${{ matrix.container }} - git_repo: mosaicml/llm-foundry - mcloud-timeout: 1800 - name: ${{ matrix.name }} - pip_deps: ${{ matrix.pip_deps }} - pytest-command: ${{ matrix.pytest_command }} - pytest-markers: ${{ matrix.markers }} - python-version: 3.9 - gpu_num: 4 - secrets: - mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }} + pytest_command: "coverage run -m pytest" + ci_repo_gpu_test_ref: v0.1.0 + steps: + - name: Run PR GPU Tests + uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.0 + with: + container: ${{ matrix.container }} + git_repo: mosaicml/llm-foundry 
+ mcloud_timeout: 1800 + name: ${{ matrix.name }} + pip_deps: ${{ matrix.pip_deps }} + pytest_command: ${{ matrix.pytest_command }} + pytest_markers: ${{ matrix.markers }} + python_version: 3.9 + gpu_num: 4 + mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }} + ci_repo_gpu_test_ref: ${{ matrix.ci_repo_gpu_test_ref }} From 72447df73379f01d74abde9b867aded654d465e7 Mon Sep 17 00:00:00 2001 From: Kevin DeShawn <126115026+KevDevSha@users.noreply.github.com> Date: Fri, 2 Aug 2024 13:02:55 -0500 Subject: [PATCH 55/57] Update pr-gpu.yaml (#1420) * Update pr-gpu.yaml * Update pr-gpu.yaml --- .github/workflows/pr-gpu.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml index 04f30b5f9c..ba1a4f9ba4 100644 --- a/.github/workflows/pr-gpu.yaml +++ b/.github/workflows/pr-gpu.yaml @@ -4,7 +4,7 @@ on: branches: - main - release/* - pull_request: + pull_request_target: branches: - main - release/** @@ -16,6 +16,7 @@ concurrency: jobs: pytest-gpu-1: name: ${{ matrix.name }} + if: github.repository_owner == 'mosaicml' runs-on: linux-ubuntu-latest strategy: fail-fast: false @@ -44,6 +45,7 @@ jobs: ci_repo_gpu_test_ref: ${{ matrix.ci_repo_gpu_test_ref }} pytest-gpu-2: name: ${{ matrix.name }} + if: github.repository_owner == 'mosaicml' runs-on: linux-ubuntu-latest strategy: fail-fast: false @@ -72,6 +74,7 @@ jobs: ci_repo_gpu_test_ref: ${{ matrix.ci_repo_gpu_test_ref }} pytest-gpu-4: name: ${{ matrix.name }} + if: github.repository_owner == 'mosaicml' runs-on: linux-ubuntu-latest strategy: fail-fast: false From 8defa8f11369332147d7c84a877fd5d122b1c97f Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Fri, 2 Aug 2024 11:33:01 -0700 Subject: [PATCH 56/57] Additional registry entrypoint documentation (#1414) --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index d9b75b7617..e8a6708c5a 100644 --- a/README.md +++ b/README.md @@ -309,10 +309,15 @@ dependencies = [ "llm-foundry", ] +# Note: Even though in python code, this would be llmfoundry.registry.loggers, +# when specified in the entry_points, it has to be "llmfoundry_loggers". That is, +# the segments of the name should be joined by an _ in the entry_points section. [project.entry-points."llmfoundry_loggers"] my_logger = "foundry_registry.loggers:MyLogger" ``` +If developing new components via entrypoints, it is important to note that Python entrypoints are global to the Python environment. This means that if you have multiple packages that register components with the same key, the last one installed will be the one used. This can be useful for overriding components in LLM Foundry, but can also lead to unexpected behavior if not careful. Additionally, if you change the pyproject.toml, you will need to reinstall the package for the changes to take effect. You can do this quickly by installing with `pip install -e . --no-deps` to avoid reinstalling dependencies. 
+ ### Direct call to register You can also register a component directly in your code: From 38dcf1e6e25a05dbd4275402f168bac299b45d6d Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Sat, 3 Aug 2024 08:42:38 -0700 Subject: [PATCH 57/57] Remove type ignore (#1421) --- llmfoundry/utils/builders.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 000155f1a4..a1d84601b3 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -127,8 +127,7 @@ def build_eval_loaders( # Load the eval data to fail fast. metrics will get added # later in add_metrics_to_eval_loaders, after the model is loaded metric_names=[], - # TODO: Fix type in Composer - device_eval_microbatch_size=device_eval_batch_size, # type: ignore + device_eval_microbatch_size=device_eval_batch_size, ) evaluators.append(eval_loader) return evaluators
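The registry entrypoint documentation added in PATCH 56 (#1414) shows a `pyproject.toml` entry point pointing at `foundry_registry.loggers:MyLogger`, but not the Python side of that example. Below is a minimal sketch of what such a plugin module might contain, together with the direct `loggers.register(...)` call that the README's "Direct call to register" section alludes to. The `LoggerDestination` base class and the exact `register()` signature are assumptions about the Composer / llm-foundry APIs, not something established by these patches.

```python
# foundry_registry/loggers.py -- hypothetical module behind the
# "foundry_registry.loggers:MyLogger" entry point shown in the PATCH 56 README diff.
from typing import Any, Dict

from composer.loggers import LoggerDestination  # assumed base class for logger plugins

from llmfoundry.registry import loggers  # registry object named in the README note


class MyLogger(LoggerDestination):
    """A do-nothing logger used only to illustrate plugin registration."""

    def log_hyperparameters(self, hyperparameters: Dict[str, Any]) -> None:
        # A real logger would forward these to an external tracking service.
        print(f"hyperparameters: {hyperparameters}")


# Alternative to the entry-point route: register the component directly in code,
# as the README's "Direct call to register" section describes (signature assumed).
loggers.register("my_logger", func=MyLogger)
```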