From 6d53fca2614136d84ef26527181bd9f0c402a23c Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Fri, 22 Sep 2023 12:14:39 -0700 Subject: [PATCH 01/18] Add support for auto packing ratio --- llmfoundry/data/finetuning/dataloader.py | 13 +- llmfoundry/data/packing.py | 202 +++++++++++++++-------- 2 files changed, 141 insertions(+), 74 deletions(-) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 661b1e808d..096b7b3c9a 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -13,7 +13,7 @@ from llmfoundry.data.finetuning.collator import Seq2SeqFinetuningCollator from llmfoundry.data.finetuning.tasks import dataset_constructor -from llmfoundry.data.packing import BinPackWrapper +from llmfoundry.data.packing import BinPackWrapper, auto_packing_ratio log = logging.getLogger(__name__) @@ -141,7 +141,7 @@ def build_finetuning_dataloader(cfg: DictConfig, ) collate_fn, dataloader_batch_size = _build_collate_fn( - cfg.dataset, tokenizer, device_batch_size) + cfg, tokenizer, device_batch_size) return DataLoader( dataset, @@ -172,7 +172,7 @@ def build_finetuning_dataloader(cfg: DictConfig, ) collate_fn, dataloader_batch_size = _build_collate_fn( - cfg.dataset, tokenizer, device_batch_size) + cfg, tokenizer, device_batch_size) if cfg.drop_last: world_size = dist.get_world_size() @@ -355,9 +355,10 @@ def _build_hf_dataset_from_remote( def _build_collate_fn( - dataset_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, + dataloader_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, device_batch_size: int ) -> Tuple[Union[Seq2SeqFinetuningCollator, BinPackWrapper], int]: + dataset_cfg = dataloader_cfg.dataset collate_fn = Seq2SeqFinetuningCollator( tokenizer=tokenizer, max_seq_len=dataset_cfg.max_seq_len, @@ -374,6 +375,10 @@ def _build_collate_fn( 'the latter to turn on packing or remove the former from the config.') return collate_fn, device_batch_size + if packing_ratio == 'auto': + packing_ratio = auto_packing_ratio(dataloader_cfg, tokenizer, + device_batch_size) + if packing_ratio == 1.0: return collate_fn, device_batch_size elif packing_ratio < 1.0: diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index d0a73be801..1969cac7da 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -260,14 +260,140 @@ def pad_tensor(tensor: torch.Tensor, pad_value: int): return batch +def auto_packing_ratio(dataloader_cfg: DictConfig, + tokenizer: PreTrainedTokenizerBase, + device_batch_size: int): + """Find a packing ratio that minimizes padding with zero waste. + + Args: + dataloader_cfg (DictConfig): The dataloader configuration for profiling. + tokenizer (PreTrainedTokenizerBase): The tokenizer for profiling. + device_batch_size (int): The size of the batches (number of examples) per device. + + Returns: + A packing ratio that minimizes padding while maintaining zero waste. + """ + min_ratio = 1 + max_ratio = 20 + num_packing_ratios = 10 + profiling_results = profile_packing(dataloader_cfg, tokenizer, min_ratio, + max_ratio, num_packing_ratios, + device_batch_size) + + # Obtain the maximum packing_ratio/minimum padding that has no waste. 
+ i = 0 + waste = 0 + packing_ratio = 1 + while i < len(profiling_results) and waste == 0: + packing_ratio, _, waste = profiling_results[i] + i += 1 + return packing_ratio + + +def profile_packing(dataloader_cfg: DictConfig, + tokenizer: PreTrainedTokenizerBase, min_ratio: float, + max_ratio: float, num_packing_ratios: int, + device_batch_size: int): + """Profile packing. + + Args: + dataloader_cfg (DictConfig): The dataloader configuration for profiling. + tokenizer (PreTrainedTokenizerBase): The tokenizer for profiling. + min_ratio (float): Smallest packing_ratio to test. Must be >=1. + max_ratio (float): Largest packing_ratio to test. Must be larger than `min_ratio`. + num_packing_ratios (int): Number of packing_ratio values (spaced between `min_ratio` and `max_ratio`) to try. + device_batch_size (int): The size of the batches (number of examples) per device. + + Returns: + A list of tuples of packing ratio, padding, and waste. + """ + import copy + + from llmfoundry import (build_finetuning_dataloader, + build_text_denoising_dataloader) + from llmfoundry.data import build_text_dataloader + + # Turn off packing for the dataloader (we want raw, pre-packed examples) + dataloader_cfg = copy.deepcopy(dataloader_cfg) + dataloader_cfg.dataset.packing_ratio = None + dataloader_cfg.dataset.max_leftovers_to_keep = None + dataloader_cfg.drop_last = False + + # Determine the packing_ratio values we'll try + packing_ratios, raw_batch_sizes = [], [] + for packing_ratio in np.linspace(min_ratio, + max_ratio, + num_packing_ratios, + endpoint=True): + packing_ratio = np.round(10 * packing_ratio) / 10 + raw_batch_size = int(packing_ratio * device_batch_size) + if raw_batch_size not in raw_batch_sizes: + packing_ratios.append(packing_ratio) + raw_batch_sizes.append(raw_batch_size) + + n_profile_examples = max(raw_batch_sizes) * 100 + + def build_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase): + if cfg.name == 'text': + return build_text_dataloader(cfg, tokenizer, n_profile_examples) + elif cfg.name == 'text_denoising': + return build_text_denoising_dataloader(cfg, tokenizer, + n_profile_examples) + elif cfg.name == 'finetuning': + return build_finetuning_dataloader(cfg, tokenizer, + n_profile_examples) + else: + raise ValueError( + f'Not sure how to build dataloader with config: {cfg}') + + train_dataloader = build_dataloader(dataloader_cfg, tokenizer) + + # Get a bunch of raw examples + big_batch = next(iter(train_dataloader)) + + def split_big_batch(raw_batch_size: int) -> List: + input_ids = big_batch['input_ids'].split(raw_batch_size) + batches = [{'input_ids': x} for x in input_ids] + + for key in big_batch.keys(): + if key == 'input_ids': + continue + for idx, split in enumerate(big_batch[key].split(raw_batch_size)): + batches[idx].update({key: split}) + return batches + + def profile(raw_batch_size: int) -> Tuple[float, float]: + packer = BinPackWrapper( + collator=lambda x: x, + target_batch_size=device_batch_size, + max_seq_len=dataloader_cfg.dataset.max_seq_len, + pad_token_id=0, # <-- Doesn't need to be correct for profiling + padding_side='left', # <-- Doesn't need to be correct for profiling + max_leftover_bins_to_keep=max_leftovers_to_keep) + + # Simulate feeding the packing collator a bunch of data + for batch in split_big_batch(raw_batch_size): + if batch['input_ids'].shape[0] < device_batch_size: + continue + _ = packer(batch) + + # Return the padding / waste stats over that bunch of data + padding_percent = 100 * (1 - packer.efficiency) + waste_percent = 100 * 
packer.waste + return padding_percent, waste_percent + + results = [] + for packing_ratio, raw_batch_size in zip(packing_ratios, raw_batch_sizes): + padding, waste = profile(raw_batch_size) + results.append((packing_ratio, padding, waste)) + return results + + if __name__ == '__main__': from argparse import ArgumentParser, Namespace from omegaconf import OmegaConf as om - from llmfoundry import (build_finetuning_dataloader, - build_text_denoising_dataloader) - from llmfoundry.data import build_text_dataloader from llmfoundry.utils import build_tokenizer def parse_args() -> Namespace: @@ -316,20 +442,6 @@ def parse_args() -> Namespace: raise ValueError('`num_packing_ratios` must be a positive integer.') return args - def build_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, - device_batch_size: int): - if cfg.name == 'text': - return build_text_dataloader(cfg, tokenizer, device_batch_size) - elif cfg.name == 'text_denoising': - return build_text_denoising_dataloader(cfg, tokenizer, - device_batch_size) - elif cfg.name == 'finetuning': - return build_finetuning_dataloader(cfg, tokenizer, - device_batch_size) - else: - raise ValueError( - f'Not sure how to build dataloader with config: {cfg}') - args = parse_args() with open(args.yaml_path) as f: @@ -339,18 +451,6 @@ def build_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, cfg = om.create(cfg) device_batch_size = cfg.global_train_batch_size // args.num_devices - # Determine the packing_ratio values we'll try - packing_ratios, raw_batch_sizes = [], [] - for packing_ratio in np.linspace(args.min, - args.max, - args.num_packing_ratios, - endpoint=True): - packing_ratio = np.round(10 * packing_ratio) / 10 - raw_batch_size = int(packing_ratio * device_batch_size) - if raw_batch_size not in raw_batch_sizes: - packing_ratios.append(packing_ratio) - raw_batch_sizes.append(raw_batch_size) - # Fetch a bunch of raw examples once, which we'll re-use if 'train_loader' not in cfg: raise ValueError('config must define train_loader') @@ -373,51 +473,13 @@ def build_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, tokenizer_kwargs = tokenizer_cfg.get('kwargs', {}) tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) - # Turn off packing for the dataloader (we want raw, pre-packed examples) - dataloader_cfg.dataset.packing_ratio = None - dataloader_cfg.dataset.max_leftovers_to_keep = None - train_dataloader = build_dataloader(dataloader_cfg, tokenizer, - max(raw_batch_sizes) * 100) - - # Get a bunch of raw examples - big_batch = next(iter(train_dataloader)) - - def split_big_batch(raw_batch_size: int) -> List: - input_ids = big_batch['input_ids'].split(raw_batch_size) - batches = [{'input_ids': x} for x in input_ids] - - for key in big_batch.keys(): - if key == 'input_ids': - continue - for idx, split in enumerate(big_batch[key].split(raw_batch_size)): - batches[idx].update({key: split}) - return batches - - def profile_packing(raw_batch_size: int) -> Tuple[float, float]: - packer = BinPackWrapper( - collator=lambda x: x, - target_batch_size=device_batch_size, - max_seq_len=dataloader_cfg.dataset.max_seq_len, - pad_token_id=0, # <-- Doesn't need to be correct for profiling - padding_side='left', # <-- Doesn't need to be correct for profiling - max_leftover_bins_to_keep=max_leftovers_to_keep) - - # Simulate feeding the packing collator a bunch of data - for batch in split_big_batch(raw_batch_size): - if batch['input_ids'].shape[0] < device_batch_size: - continue - _ = packer(batch) - - # Return the 
padding / waste stats over that bunch of data - padding_percent = 100 * (1 - packer.efficiency) - waste_percent = 100 * packer.waste - return padding_percent, waste_percent + results = profile_packing(dataloader_cfg, tokenizer, args.min, args.max, + args.num_packing_ratios, device_batch_size) header = '\n\n\n packing_ratio | % PADDING | % WASTE' fstr = ' {:5.1f} | {:5.2f}% | {:6.2f}%' print(header) print('-' * len(header)) - for packing_ratio, raw_batch_size in zip(packing_ratios, raw_batch_sizes): - padding, waste = profile_packing(raw_batch_size) + for packing_ratio, padding, waste in results: print(fstr.format(packing_ratio, padding, waste)) From 1c0f157c786cbaf60e1e452b72303ffd7948dc59 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Tue, 17 Oct 2023 06:27:31 +0000 Subject: [PATCH 02/18] Add test --- tests/test_packing.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 tests/test_packing.py diff --git a/tests/test_packing.py b/tests/test_packing.py new file mode 100644 index 0000000000..16e2466b79 --- /dev/null +++ b/tests/test_packing.py @@ -0,0 +1,32 @@ +from llmfoundry.data.finetuning.dataloader import build_finetuning_dataloader + +from llmfoundry.utils.builders import build_tokenizer +from omegaconf import DictConfig + + +def test_auto_packing(): + dataloader_cfg = DictConfig({ + 'name': 'finetuning', + 'dataset': { + 'hf_name': 'mosaicml/dolly_hhrlhf', + 'split': 'train', + 'max_seq_len': 1024, + 'allow_pad_trimming': False, + 'decoder_only_format': True, + 'packing_ratio': 'auto', + 'shuffle': True, + }, + 'drop_last': False, + 'num_workers': 8, + 'pin_memory': False, + 'prefetch_factor': 2, + 'persistent_workers': True, + 'timeout': 0, + }) + + tokenizer = build_tokenizer('mosaicml/mpt-7b', {}) + + dataloader = build_finetuning_dataloader(dataloader_cfg, tokenizer, 6) + + for sample in dataloader: + print(sample) \ No newline at end of file From b32bac003e1bdfb9f67917ad9d27cbd88bbd2dd8 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Thu, 19 Oct 2023 19:10:18 +0000 Subject: [PATCH 03/18] Refactor and change to generator --- llmfoundry/data/__init__.py | 3 +++ llmfoundry/data/dataloader.py | 37 +++++++++++++++++++++++++++ llmfoundry/data/packing.py | 47 +++++++++++------------------------ scripts/train/train.py | 31 +++-------------------- 4 files changed, 57 insertions(+), 61 deletions(-) create mode 100644 llmfoundry/data/dataloader.py diff --git a/llmfoundry/data/__init__.py b/llmfoundry/data/__init__.py index c997c865dd..15dc588216 100644 --- a/llmfoundry/data/__init__.py +++ b/llmfoundry/data/__init__.py @@ -9,6 +9,8 @@ from llmfoundry.data.text_data import (StreamingTextDataset, build_text_dataloader) +from llmfoundry.data.dataloader import build_dataloader + __all__ = [ 'MixtureOfDenoisersCollator', 'build_text_denoising_dataloader', @@ -18,4 +20,5 @@ 'build_text_dataloader', 'NoConcatDataset', 'ConcatTokensDataset', + 'build_dataloader', ] diff --git a/llmfoundry/data/dataloader.py b/llmfoundry/data/dataloader.py new file mode 100644 index 0000000000..0d5f453cf4 --- /dev/null +++ b/llmfoundry/data/dataloader.py @@ -0,0 +1,37 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +"""Dataloader builder utilities.""" + +from omegaconf import DictConfig +from transformers import PreTrainedTokenizerBase + +from llmfoundry.data.text_data import build_text_dataloader + +from llmfoundry.data.denoising import build_text_denoising_dataloader + +from llmfoundry.data.finetuning.dataloader import 
build_finetuning_dataloader + + +def build_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, + device_batch_size: int): + if cfg.name == 'text': + return build_text_dataloader( + cfg, + tokenizer, + device_batch_size, + ) + elif cfg.name == 'text_denoising': + return build_text_denoising_dataloader( + cfg, + tokenizer, + device_batch_size, + ) + elif cfg.name == 'finetuning': + return build_finetuning_dataloader( + cfg, + tokenizer, + device_batch_size, + ) + else: + raise ValueError(f'Not sure how to build dataloader with config: {cfg}') diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index 1969cac7da..962f7a2666 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -2,14 +2,13 @@ # SPDX-License-Identifier: Apache-2.0 import os -from typing import Any, Callable, Dict, List, Literal, Optional, Tuple +from typing import Any, Callable, Dict, Iterable, List, Literal, Optional, Tuple import numpy as np import torch from omegaconf import DictConfig from transformers import PreTrainedTokenizerBase - class BinPackWrapper: """Utility collator for packing to reduce padding.""" @@ -281,20 +280,19 @@ def auto_packing_ratio(dataloader_cfg: DictConfig, device_batch_size) # Obtain the maximum packing_ratio/minimum padding that has no waste. - i = 0 - waste = 0 - packing_ratio = 1 - while i < len(profiling_results) and waste == 0: - packing_ratio, _, waste = profiling_results[i] - i += 1 - return packing_ratio + prev_packing_ratio = 1 + for packing_ratio, _, waste in profiling_results: + if waste > 0: + break + prev_packing_ratio = packing_ratio + return prev_packing_ratio def profile_packing(dataloader_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, min_ratio: float, max_ratio: float, num_packing_ratios: int, - device_batch_size: int): - """Profile packing. + device_batch_size: int) -> Iterable[Tuple[float, float, float]]: + """Generator function that profiles example packing across packing ratios. Args: dataloader_cfg (DictConfig): The dataloader configuration for profiling. @@ -305,13 +303,11 @@ def profile_packing(dataloader_cfg: DictConfig, device_batch_size (int): The size of the batches (number of examples) per device. Returns: - A list of tuples of packing ratio, padding, and waste. + An iterable of tuples of packing ratio, padding, and waste. 
""" import copy + from llmfoundry.data.dataloader import build_dataloader - from llmfoundry import (build_finetuning_dataloader, - build_text_denoising_dataloader) - from llmfoundry.data import build_text_dataloader # Turn off packing for the dataloader (we want raw, pre-packed examples) dataloader_cfg = copy.deepcopy(dataloader_cfg) @@ -333,20 +329,7 @@ def profile_packing(dataloader_cfg: DictConfig, n_profile_examples = max(raw_batch_sizes) * 100 - def build_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase): - if cfg.name == 'text': - return build_text_dataloader(cfg, tokenizer, n_profile_examples) - elif cfg.name == 'text_denoising': - return build_text_denoising_dataloader(cfg, tokenizer, - n_profile_examples) - elif cfg.name == 'finetuning': - return build_finetuning_dataloader(cfg, tokenizer, - n_profile_examples) - else: - raise ValueError( - f'Not sure how to build dataloader with config: {cfg}') - - train_dataloader = build_dataloader(dataloader_cfg, tokenizer) + train_dataloader = build_dataloader(dataloader_cfg, tokenizer, n_profile_examples) # Get a bunch of raw examples big_batch = next(iter(train_dataloader)) @@ -369,7 +352,7 @@ def profile(raw_batch_size: int) -> Tuple[float, float]: max_seq_len=dataloader_cfg.dataset.max_seq_len, pad_token_id=0, # <-- Doesn't need to be correct for profiling padding_side='left', # <-- Doesn't need to be correct for profiling - max_leftover_bins_to_keep=max_leftovers_to_keep) + max_leftover_bins_to_keep=dataloader_cfg.dataset.max_leftovers_to_keep) # Simulate feeding the packing collator a bunch of data for batch in split_big_batch(raw_batch_size): @@ -382,11 +365,9 @@ def profile(raw_batch_size: int) -> Tuple[float, float]: waste_percent = 100 * packer.waste return padding_percent, waste_percent - results = [] for packing_ratio, raw_batch_size in zip(packing_ratios, raw_batch_sizes): padding, waste = profile(raw_batch_size) - results.append((packing_ratio, padding, waste)) - return results + yield (packing_ratio, padding, waste) if __name__ == '__main__': diff --git a/scripts/train/train.py b/scripts/train/train.py index 87217702e5..dae4d3f24b 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -17,9 +17,7 @@ from transformers import PreTrainedTokenizerBase from llmfoundry import (COMPOSER_MODEL_REGISTRY, ComposerHFCausalLM, - MPTForCausalLM, build_finetuning_dataloader, - build_text_denoising_dataloader) -from llmfoundry.data.text_data import build_text_dataloader + MPTForCausalLM) from llmfoundry.utils.builders import (build_algorithm, build_callback, build_icl_data_and_gauntlet, build_logger, build_optimizer, @@ -28,6 +26,8 @@ process_init_device, update_batch_size_info) +from llmfoundry.data.dataloader import build_dataloader + def validate_config(cfg: DictConfig): """Validates compatible model and dataloader selection.""" @@ -152,31 +152,6 @@ def print_trainable_parameters(model: torch.nn.Module) -> None: f'trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}' ) - -def build_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, - device_batch_size: int): - if cfg.name == 'text': - return build_text_dataloader( - cfg, - tokenizer, - device_batch_size, - ) - elif cfg.name == 'text_denoising': - return build_text_denoising_dataloader( - cfg, - tokenizer, - device_batch_size, - ) - elif cfg.name == 'finetuning': - return build_finetuning_dataloader( - cfg, - tokenizer, - device_batch_size, - ) - else: - raise ValueError(f'Not sure how to 
build dataloader with config: {cfg}') - - def main(cfg: DictConfig) -> Trainer: # Filter deprecation warning from torch internal usage warnings.filterwarnings( From d9dcdbcfb23199a3cd91b819770ff46866aca9bd Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Thu, 19 Oct 2023 21:00:45 +0000 Subject: [PATCH 04/18] Add simple tests --- llmfoundry/data/denoising.py | 4 +- llmfoundry/data/finetuning/dataloader.py | 6 +- llmfoundry/data/packing.py | 127 +++-------------- scripts/misc/profile_packing.py | 99 ++++++++++++++ tests/test_packing.py | 165 +++++++++++++++++++---- 5 files changed, 257 insertions(+), 144 deletions(-) create mode 100644 scripts/misc/profile_packing.py diff --git a/llmfoundry/data/denoising.py b/llmfoundry/data/denoising.py index d685d0077d..f7ff642be0 100644 --- a/llmfoundry/data/denoising.py +++ b/llmfoundry/data/denoising.py @@ -15,7 +15,7 @@ from torch.utils.data import DataLoader from transformers import PreTrainedTokenizerBase -from llmfoundry.data.packing import BinPackWrapper +from llmfoundry.data.packing import BinPackCollator from llmfoundry.data.text_data import StreamingTextDataset from llmfoundry.models import utils @@ -490,7 +490,7 @@ def build_text_denoising_dataloader( raise NotImplementedError( 'On-the-fly packing is currently only supported for decoder-only formats.' ) - collate_fn = BinPackWrapper( + collate_fn = BinPackCollator( collator=collate_fn, target_batch_size=device_batch_size, max_seq_len=cfg.dataset.max_seq_len, diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 096b7b3c9a..f2a5410a66 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -13,7 +13,7 @@ from llmfoundry.data.finetuning.collator import Seq2SeqFinetuningCollator from llmfoundry.data.finetuning.tasks import dataset_constructor -from llmfoundry.data.packing import BinPackWrapper, auto_packing_ratio +from llmfoundry.data.packing import BinPackCollator, auto_packing_ratio log = logging.getLogger(__name__) @@ -357,7 +357,7 @@ def _build_hf_dataset_from_remote( def _build_collate_fn( dataloader_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, device_batch_size: int -) -> Tuple[Union[Seq2SeqFinetuningCollator, BinPackWrapper], int]: +) -> Tuple[Union[Seq2SeqFinetuningCollator, BinPackCollator], int]: dataset_cfg = dataloader_cfg.dataset collate_fn = Seq2SeqFinetuningCollator( tokenizer=tokenizer, @@ -389,7 +389,7 @@ def _build_collate_fn( 'On-the-fly packing is currently only supported for decoder-only formats.' 
) - collate_fn = BinPackWrapper( + collate_fn = BinPackCollator( collator=collate_fn, target_batch_size=device_batch_size, max_seq_len=dataset_cfg.max_seq_len, diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index 962f7a2666..904c7f39cd 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -1,15 +1,14 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 -import os -from typing import Any, Callable, Dict, Iterable, List, Literal, Optional, Tuple +from typing import Callable, Dict, Iterable, List, Literal, Optional, Tuple import numpy as np import torch from omegaconf import DictConfig from transformers import PreTrainedTokenizerBase -class BinPackWrapper: +class BinPackCollator: """Utility collator for packing to reduce padding.""" def __init__(self, @@ -57,7 +56,7 @@ def efficiency(self) -> float: def __call__( self, - examples: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]: + examples: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: batch = self.base_collator(examples) assert 'attention_mask' in batch @@ -74,12 +73,12 @@ def __call__( # Cut everything down to size sizes, trimmed_examples = [], [] for idx in range(batch['attention_mask'].shape[0]): - size, trimmed_example = extract_trim_batch_idx(batch, idx) + size, trimmed_example = _extract_trim_batch_idx(batch, idx) sizes.append(size) trimmed_examples.append(trimmed_example) # Apply our CS 101 bin packing algorithm. - packed_examples, n_packed_tokens, n_total_tokens, leftover_bins = first_fit_bin_packing( + packed_examples, n_packed_tokens, n_total_tokens, leftover_bins = _first_fit_bin_packing( sizes=sizes, examples=trimmed_examples, num_bins=self.out_size, @@ -92,14 +91,14 @@ def __call__( self._leftover_bins = leftover_bins[:self.max_leftover_bins_to_keep] # Re-pad to max_seq_len and batch - batch = repad(packed_examples, + batch = _repad(packed_examples, max_seq_len=self.max_seq_len, pad_token_id=self.pad_token_id, padding_side=self.padding_side) return batch -def extract_trim_batch_idx(batch: Dict[str, torch.Tensor], +def _extract_trim_batch_idx(batch: Dict[str, torch.Tensor], idx: int) -> Tuple[int, Dict[str, torch.Tensor]]: example = {k: v[idx] for k, v in batch.items()} @@ -111,7 +110,7 @@ def extract_trim_batch_idx(batch: Dict[str, torch.Tensor], return size, trim_example -def combine_in_place( +def _combine_in_place( example: Dict[str, torch.Tensor], add_on: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: if 'labels' in add_on: @@ -128,7 +127,7 @@ def combine_in_place( return example -def first_fit_bin_packing( +def _first_fit_bin_packing( sizes: List[int], examples: List[Dict[str, torch.Tensor]], num_bins: int, max_bin_size: int, existing_bins: List[Tuple[int, Dict[str, torch.Tensor]]] ) -> Tuple[List[Dict[str, torch.Tensor]], int, int, List[Tuple[int, Dict[ @@ -193,7 +192,7 @@ def first_fit_bin_packing( if bins[bidx][0] + size <= max_bin_size: bin_size, packed_example = bins.pop(bidx) bin_size = bin_size + size - packed_example = combine_in_place(packed_example, example) + packed_example = _combine_in_place(packed_example, example) bins.append((bin_size, packed_example)) added = True break @@ -224,7 +223,7 @@ def first_fit_bin_packing( bin_sizes[:num_bins]), sum(sizes), sorted_bins[num_bins:] -def repad(packed_examples: List[Dict[str, torch.Tensor]], max_seq_len: int, +def _repad(packed_examples: List[Dict[str, torch.Tensor]], max_seq_len: int, pad_token_id: int, padding_side: str) -> Dict[str, torch.Tensor]: def 
pad_tensor(tensor: torch.Tensor, pad_value: int): @@ -273,8 +272,8 @@ def auto_packing_ratio(dataloader_cfg: DictConfig, A packing ratio that minimizes padding while maintaining zero waste. """ min_ratio = 1 - max_ratio = 20 - num_packing_ratios = 10 + max_ratio = dataloader_cfg.dataset.max_seq_len / 100 + num_packing_ratios = 20 profiling_results = profile_packing(dataloader_cfg, tokenizer, min_ratio, max_ratio, num_packing_ratios, device_batch_size) @@ -285,6 +284,7 @@ def auto_packing_ratio(dataloader_cfg: DictConfig, if waste > 0: break prev_packing_ratio = packing_ratio + print('packing ratio!', packing_ratio) return prev_packing_ratio @@ -308,7 +308,6 @@ def profile_packing(dataloader_cfg: DictConfig, import copy from llmfoundry.data.dataloader import build_dataloader - # Turn off packing for the dataloader (we want raw, pre-packed examples) dataloader_cfg = copy.deepcopy(dataloader_cfg) dataloader_cfg.dataset.packing_ratio = None @@ -346,7 +345,7 @@ def split_big_batch(raw_batch_size: int) -> List: return batches def profile(raw_batch_size: int) -> Tuple[float, float]: - packer = BinPackWrapper( + packer = BinPackCollator( collator=lambda x: x, target_batch_size=device_batch_size, max_seq_len=dataloader_cfg.dataset.max_seq_len, @@ -368,99 +367,3 @@ def profile(raw_batch_size: int) -> Tuple[float, float]: for packing_ratio, raw_batch_size in zip(packing_ratios, raw_batch_sizes): padding, waste = profile(raw_batch_size) yield (packing_ratio, padding, waste) - - -if __name__ == '__main__': - from argparse import ArgumentParser, Namespace - - from omegaconf import OmegaConf as om - - from llmfoundry.utils import build_tokenizer - - def parse_args() -> Namespace: - """Parse commandline arguments.""" - parser = ArgumentParser( - description= - 'Profile packing_ratio choices for a particular workload.') - parser.add_argument( - '--yaml-path', - type=str, - required=True, - help='Path to the YAML that defines the workload to profile.') - parser.add_argument('--num-devices', - type=int, - default=None, - help='How many devices your run will use.') - parser.add_argument('--min', - type=float, - required=True, - help='Smallest packing_ratio to test. Must be >=1.') - parser.add_argument( - '--max', - type=float, - required=True, - help='Largest packing_ratio to test. Must be larger than `min`.') - parser.add_argument( - '--num-packing-ratios', - type=int, - default=10, - help= - 'Number of packing_ratio values (spaced between `min` and `max) to try.' 
- ) - - args = parser.parse_args() - - if not os.path.isfile(args.yaml_path): - raise FileNotFoundError( - '`yaml_path` does not correspond to any existing file.') - if args.num_devices < 1: - raise ValueError('`num_devices` must be a positive integer.') - if args.min < 1.0: - raise ValueError('`min` must be >=1.0.') - if args.max < args.min: - raise ValueError('`max` cannot be less than `min`.') - if args.num_packing_ratios < 1: - raise ValueError('`num_packing_ratios` must be a positive integer.') - return args - - args = parse_args() - - with open(args.yaml_path) as f: - cfg = om.load(f) - if 'parameters' in cfg: - cfg = om.to_container(cfg.parameters) - cfg = om.create(cfg) - device_batch_size = cfg.global_train_batch_size // args.num_devices - - # Fetch a bunch of raw examples once, which we'll re-use - if 'train_loader' not in cfg: - raise ValueError('config must define train_loader') - dataloader_cfg = cfg.train_loader - - max_leftovers_to_keep = dataloader_cfg.dataset.get('max_leftovers_to_keep', - None) - - # build tokenizer - if 'tokenizer' not in cfg: - raise ValueError('config must define tokenizer') - - resolved_tokenizer_cfg = om.to_container(cfg.tokenizer, resolve=True) - if not isinstance(resolved_tokenizer_cfg, Dict): - raise ValueError( - 'tokenizer config needs to be resolved by omegaconf into a Dict.') - tokenizer_cfg: Dict[Any, Any] = resolved_tokenizer_cfg - - tokenizer_name = tokenizer_cfg['name'] - tokenizer_kwargs = tokenizer_cfg.get('kwargs', {}) - tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) - - results = profile_packing(dataloader_cfg, tokenizer, args.min, args.max, - args.num_packing_ratios, device_batch_size) - - header = '\n\n\n packing_ratio | % PADDING | % WASTE' - fstr = ' {:5.1f} | {:5.2f}% | {:6.2f}%' - - print(header) - print('-' * len(header)) - for packing_ratio, padding, waste in results: - print(fstr.format(packing_ratio, padding, waste)) diff --git a/scripts/misc/profile_packing.py b/scripts/misc/profile_packing.py new file mode 100644 index 0000000000..a704dbea49 --- /dev/null +++ b/scripts/misc/profile_packing.py @@ -0,0 +1,99 @@ +from typing import Any, Dict + +from llmfoundry.data.packing import profile_packing + + +if __name__ == '__main__': + from argparse import ArgumentParser, Namespace + + from omegaconf import OmegaConf as om + + from llmfoundry.utils import build_tokenizer + + def parse_args() -> Namespace: + """Parse commandline arguments.""" + parser = ArgumentParser( + description= + 'Profile packing_ratio choices for a particular workload.') + parser.add_argument( + '--yaml-path', + type=str, + required=True, + help='Path to the YAML that defines the workload to profile.') + parser.add_argument('--num-devices', + type=int, + default=None, + help='How many devices your run will use.') + parser.add_argument('--min', + type=float, + required=True, + help='Smallest packing_ratio to test. Must be >=1.') + parser.add_argument( + '--max', + type=float, + required=True, + help='Largest packing_ratio to test. Must be larger than `min`.') + parser.add_argument( + '--num-packing-ratios', + type=int, + default=10, + help= + 'Number of packing_ratio values (spaced between `min` and `max) to try.' 
+ ) + + args = parser.parse_args() + + if not os.path.isfile(args.yaml_path): + raise FileNotFoundError( + '`yaml_path` does not correspond to any existing file.') + if args.num_devices < 1: + raise ValueError('`num_devices` must be a positive integer.') + if args.min < 1.0: + raise ValueError('`min` must be >=1.0.') + if args.max < args.min: + raise ValueError('`max` cannot be less than `min`.') + if args.num_packing_ratios < 1: + raise ValueError('`num_packing_ratios` must be a positive integer.') + return args + + args = parse_args() + + with open(args.yaml_path) as f: + cfg = om.load(f) + if 'parameters' in cfg: + cfg = om.to_container(cfg.parameters) + cfg = om.create(cfg) + device_batch_size = cfg.global_train_batch_size // args.num_devices + + # Fetch a bunch of raw examples once, which we'll re-use + if 'train_loader' not in cfg: + raise ValueError('config must define train_loader') + dataloader_cfg = cfg.train_loader + + max_leftovers_to_keep = dataloader_cfg.dataset.get('max_leftovers_to_keep', + None) + + # build tokenizer + if 'tokenizer' not in cfg: + raise ValueError('config must define tokenizer') + + resolved_tokenizer_cfg = om.to_container(cfg.tokenizer, resolve=True) + if not isinstance(resolved_tokenizer_cfg, Dict): + raise ValueError( + 'tokenizer config needs to be resolved by omegaconf into a Dict.') + tokenizer_cfg: Dict[Any, Any] = resolved_tokenizer_cfg + + tokenizer_name = tokenizer_cfg['name'] + tokenizer_kwargs = tokenizer_cfg.get('kwargs', {}) + tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) + + results = profile_packing(dataloader_cfg, tokenizer, args.min, args.max, + args.num_packing_ratios, device_batch_size) + + header = '\n\n\n packing_ratio | % PADDING | % WASTE' + fstr = ' {:5.1f} | {:5.2f}% | {:6.2f}%' + + print(header) + print('-' * len(header)) + for packing_ratio, padding, waste in results: + print(fstr.format(packing_ratio, padding, waste)) diff --git a/tests/test_packing.py b/tests/test_packing.py index 16e2466b79..5c5e38b9b6 100644 --- a/tests/test_packing.py +++ b/tests/test_packing.py @@ -1,32 +1,143 @@ + + +import os +from typing import Dict, List +from llmfoundry.data.packing import BinPackCollator +from omegaconf import DictConfig +from pytest import approx +import torch +from composer.utils import dist + +from tests.data_utils import make_tiny_ft_dataset + from llmfoundry.data.finetuning.dataloader import build_finetuning_dataloader from llmfoundry.utils.builders import build_tokenizer -from omegaconf import DictConfig +def _data_to_batch(data: List[int], max_seq_len: int, pad_token_id: int, ) -> Dict[str, torch.Tensor]: + input_ids = torch.stack([ + torch.tensor(d + [pad_token_id] * (max_seq_len - len(d))) + for d in data + ]) + + attention_mask = torch.stack([ + torch.tensor([1] * len(d) + [pad_token_id] * (max_seq_len - len(d))) + for d in data + ]) + return { 'input_ids': input_ids, 'attention_mask': attention_mask } + +def test_simple_packing(): + pad_token_id = 0 + max_seq_len = 5 + pack = BinPackCollator( + collator=lambda x: x, + target_batch_size=2, + max_seq_len=max_seq_len, + pad_token_id=pad_token_id, + padding_side = 'right' + ) + + batch = _data_to_batch([ + [1], + [2] * 2, + [4] * 4, + [3] * 3, + ], max_seq_len, pad_token_id) + + packed_samples = pack(batch) + + assert torch.equal(packed_samples['input_ids'], torch.Tensor([[3,3,3,2,2],[4,4,4,4,1]])) + assert torch.all(packed_samples['attention_mask'] == 1) + + +def test_simple_packing_leftovers(): + pad_token_id = 0 + max_seq_len = 5 + pack = BinPackCollator( + 
collator=lambda x: x, + target_batch_size=2, + max_seq_len=max_seq_len, + pad_token_id=pad_token_id, + padding_side = 'right' + ) + + batch = _data_to_batch([ + [1], + [2] * 2, + [4] * 4, + [4] * 4, + ], max_seq_len, pad_token_id) + + packed_batch = pack(batch) + + assert torch.equal(packed_batch['input_ids'], torch.Tensor([[4,4,4,4,1],[4,4,4,4,0]])) + assert torch.equal(packed_batch['attention_mask'], torch.Tensor([[1,1,1,1,1],[1,1,1,1,0]])) + + # Check leftovers and waste. + assert len(pack._leftover_bins) == 1 + leftover_size, leftover = pack._leftover_bins[0] + assert leftover_size == 2 + assert torch.equal(leftover['input_ids'], torch.Tensor([2,2])) + assert torch.equal(leftover['attention_mask'], torch.Tensor([1,1])) + assert pack.waste == approx(2/11) # 2 tokens wasted of 11 tokens total + + # Ensure that leftovers are used in the next batch if possible. + batch = _data_to_batch([[1]], max_seq_len, pad_token_id) + packed_batch = pack(batch) + assert torch.equal(packed_batch['input_ids'], torch.Tensor([[2,2,0,0,0],[1,0,0,0,0]])) + assert torch.equal(packed_batch['attention_mask'], torch.Tensor([[1,1,0,0,0],[1,0,0,0,0]])) + +# def test_auto_packing(): +# max_seq_len = 2048 +# tiny_dataset_folder_path = os.path.join(os.getcwd(), 'test-ift-data-small') +# tiny_dataset_path = os.path.join(tiny_dataset_folder_path, 'train.jsonl') +# if dist.get_global_rank() == 0: +# make_tiny_ft_dataset(path=tiny_dataset_path, size=100) + +# cfg = DictConfig({ +# 'name': 'finetuning', +# 'dataset': { +# 'hf_name': tiny_dataset_folder_path, +# 'split': 'train', +# 'max_seq_len': max_seq_len, +# 'decoder_only_format': True, +# 'allow_pad_trimming': False, +# 'packing_ratio': 'auto', +# 'shuffle': True, +# }, +# 'drop_last': False, +# 'num_workers': 4, +# 'pin_memory': False, +# 'prefetch_factor': 2, +# 'persistent_workers': False, +# 'timeout': 0 +# }) + +# tokenizer = build_tokenizer('mosaicml/mpt-7b', {}) + +# dataloader = build_finetuning_dataloader(cfg, tokenizer, 2) + + +# def test_auto_packing(): +# dataloader_cfg = DictConfig({ +# 'name': 'finetuning', +# 'dataset': { +# 'hf_name': 'mosaicml/dolly_hhrlhf', +# 'split': 'train', +# 'max_seq_len': 1024, +# 'allow_pad_trimming': False, +# 'decoder_only_format': True, +# 'packing_ratio': 'auto', +# 'shuffle': True, +# }, +# 'drop_last': False, +# 'num_workers': 8, +# 'pin_memory': False, +# 'prefetch_factor': 2, +# 'persistent_workers': True, +# 'timeout': 0, +# }) + +# tokenizer = build_tokenizer('mosaicml/mpt-7b', {}) -def test_auto_packing(): - dataloader_cfg = DictConfig({ - 'name': 'finetuning', - 'dataset': { - 'hf_name': 'mosaicml/dolly_hhrlhf', - 'split': 'train', - 'max_seq_len': 1024, - 'allow_pad_trimming': False, - 'decoder_only_format': True, - 'packing_ratio': 'auto', - 'shuffle': True, - }, - 'drop_last': False, - 'num_workers': 8, - 'pin_memory': False, - 'prefetch_factor': 2, - 'persistent_workers': True, - 'timeout': 0, - }) - - tokenizer = build_tokenizer('mosaicml/mpt-7b', {}) - - dataloader = build_finetuning_dataloader(dataloader_cfg, tokenizer, 6) - - for sample in dataloader: - print(sample) \ No newline at end of file +# dataloader = build_finetuning_dataloader(dataloader_cfg, tokenizer, 6) From 93d79264015910a08dfab0a3f018072013cdf599 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Thu, 19 Oct 2023 22:44:35 +0000 Subject: [PATCH 05/18] Add auto packing tests --- llmfoundry/data/packing.py | 1 - tests/test_packing.py | 83 ++++++++------------------------------ 2 files changed, 17 insertions(+), 67 deletions(-) diff --git 
a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index 904c7f39cd..5ae861f3b3 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -284,7 +284,6 @@ def auto_packing_ratio(dataloader_cfg: DictConfig, if waste > 0: break prev_packing_ratio = packing_ratio - print('packing ratio!', packing_ratio) return prev_packing_ratio diff --git a/tests/test_packing.py b/tests/test_packing.py index 5c5e38b9b6..1d4bf5dac0 100644 --- a/tests/test_packing.py +++ b/tests/test_packing.py @@ -1,18 +1,11 @@ -import os from typing import Dict, List -from llmfoundry.data.packing import BinPackCollator +from unittest.mock import Mock, patch +from llmfoundry.data.packing import BinPackCollator, auto_packing_ratio from omegaconf import DictConfig from pytest import approx import torch -from composer.utils import dist - -from tests.data_utils import make_tiny_ft_dataset - -from llmfoundry.data.finetuning.dataloader import build_finetuning_dataloader - -from llmfoundry.utils.builders import build_tokenizer def _data_to_batch(data: List[int], max_seq_len: int, pad_token_id: int, ) -> Dict[str, torch.Tensor]: input_ids = torch.stack([ @@ -26,7 +19,7 @@ def _data_to_batch(data: List[int], max_seq_len: int, pad_token_id: int, ) -> Di ]) return { 'input_ids': input_ids, 'attention_mask': attention_mask } -def test_simple_packing(): +def test_packing(): pad_token_id = 0 max_seq_len = 5 pack = BinPackCollator( @@ -49,8 +42,7 @@ def test_simple_packing(): assert torch.equal(packed_samples['input_ids'], torch.Tensor([[3,3,3,2,2],[4,4,4,4,1]])) assert torch.all(packed_samples['attention_mask'] == 1) - -def test_simple_packing_leftovers(): +def test_packing_with_leftovers(): pad_token_id = 0 max_seq_len = 5 pack = BinPackCollator( @@ -87,57 +79,16 @@ def test_simple_packing_leftovers(): assert torch.equal(packed_batch['input_ids'], torch.Tensor([[2,2,0,0,0],[1,0,0,0,0]])) assert torch.equal(packed_batch['attention_mask'], torch.Tensor([[1,1,0,0,0],[1,0,0,0,0]])) -# def test_auto_packing(): -# max_seq_len = 2048 -# tiny_dataset_folder_path = os.path.join(os.getcwd(), 'test-ift-data-small') -# tiny_dataset_path = os.path.join(tiny_dataset_folder_path, 'train.jsonl') -# if dist.get_global_rank() == 0: -# make_tiny_ft_dataset(path=tiny_dataset_path, size=100) - -# cfg = DictConfig({ -# 'name': 'finetuning', -# 'dataset': { -# 'hf_name': tiny_dataset_folder_path, -# 'split': 'train', -# 'max_seq_len': max_seq_len, -# 'decoder_only_format': True, -# 'allow_pad_trimming': False, -# 'packing_ratio': 'auto', -# 'shuffle': True, -# }, -# 'drop_last': False, -# 'num_workers': 4, -# 'pin_memory': False, -# 'prefetch_factor': 2, -# 'persistent_workers': False, -# 'timeout': 0 -# }) - -# tokenizer = build_tokenizer('mosaicml/mpt-7b', {}) - -# dataloader = build_finetuning_dataloader(cfg, tokenizer, 2) - - -# def test_auto_packing(): -# dataloader_cfg = DictConfig({ -# 'name': 'finetuning', -# 'dataset': { -# 'hf_name': 'mosaicml/dolly_hhrlhf', -# 'split': 'train', -# 'max_seq_len': 1024, -# 'allow_pad_trimming': False, -# 'decoder_only_format': True, -# 'packing_ratio': 'auto', -# 'shuffle': True, -# }, -# 'drop_last': False, -# 'num_workers': 8, -# 'pin_memory': False, -# 'prefetch_factor': 2, -# 'persistent_workers': True, -# 'timeout': 0, -# }) - -# tokenizer = build_tokenizer('mosaicml/mpt-7b', {}) - -# dataloader = build_finetuning_dataloader(dataloader_cfg, tokenizer, 6) +@patch('llmfoundry.data.packing.profile_packing') +def test_auto_packing(profile_packing: Mock): + # List of tuples of packing_ratio, 
padding, waste, sorted by packing ratio + profile_packing.return_value = [(1, .9, 0), (2, .8, 0), (3, .7, .5)] + + packing_ratio = auto_packing_ratio( + dataloader_cfg=DictConfig({'dataset': {'max_seq_len': 2048 }}), + tokenizer=None, + device_batch_size=1, + ) # Dummy values, profiling results are already set. + + # auto packing ratio should choose 2 because packing ratio is maximized while waste is 0. + assert packing_ratio == 2 From ec71fec0233f8d58614896284a4db023157161f4 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Thu, 19 Oct 2023 22:47:00 +0000 Subject: [PATCH 06/18] Add auto packing to test_dataloader --- tests/test_dataloader.py | 6 +++--- tests/test_packing.py | 2 -- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/test_dataloader.py b/tests/test_dataloader.py index eea887d663..ccb1130da9 100644 --- a/tests/test_dataloader.py +++ b/tests/test_dataloader.py @@ -7,7 +7,7 @@ import sys import tempfile from argparse import Namespace -from typing import Optional +from typing import Any, Optional import pytest import torch @@ -243,10 +243,10 @@ def test_denoising_dataloader(decoder_only_format: bool, pretokenize: bool, @pytest.mark.parametrize('decoder_only_format', [True, False]) @pytest.mark.parametrize('allow_pad_trimming', [True, False]) -@pytest.mark.parametrize('packing_ratio', [10.0, None]) +@pytest.mark.parametrize('packing_ratio', [10.0, None, 'auto']) def test_finetuning_dataloader(decoder_only_format: bool, allow_pad_trimming: bool, - packing_ratio: Optional[float]): + packing_ratio: Optional[Any]): # Use the datasets just built in the last test tokenizer_name = 'gpt2' if decoder_only_format else 't5-base' max_seq_len = 2048 if decoder_only_format else 1024 diff --git a/tests/test_packing.py b/tests/test_packing.py index 1d4bf5dac0..0808157bc2 100644 --- a/tests/test_packing.py +++ b/tests/test_packing.py @@ -1,5 +1,3 @@ - - from typing import Dict, List from unittest.mock import Mock, patch from llmfoundry.data.packing import BinPackCollator, auto_packing_ratio From 0db972a37ba80422447e2a5c66e3023cb43affff Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Thu, 19 Oct 2023 23:05:18 +0000 Subject: [PATCH 07/18] use correct max leftovers to keep --- llmfoundry/data/packing.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index 5ae861f3b3..55d838b04a 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -307,6 +307,9 @@ def profile_packing(dataloader_cfg: DictConfig, import copy from llmfoundry.data.dataloader import build_dataloader + max_seq_len = dataloader_cfg.dataset.get('max_seq_len') + max_leftovers_to_keep = dataloader_cfg.dataset.get('max_leftovers_to_keep', None) + # Turn off packing for the dataloader (we want raw, pre-packed examples) dataloader_cfg = copy.deepcopy(dataloader_cfg) dataloader_cfg.dataset.packing_ratio = None @@ -347,10 +350,10 @@ def profile(raw_batch_size: int) -> Tuple[float, float]: packer = BinPackCollator( collator=lambda x: x, target_batch_size=device_batch_size, - max_seq_len=dataloader_cfg.dataset.max_seq_len, + max_seq_len=max_seq_len, pad_token_id=0, # <-- Doesn't need to be correct for profiling padding_side='left', # <-- Doesn't need to be correct for profiling - max_leftover_bins_to_keep=dataloader_cfg.dataset.max_leftovers_to_keep) + max_leftover_bins_to_keep=max_leftovers_to_keep) # Simulate feeding the packing collator a bunch of data for batch in split_big_batch(raw_batch_size): From 
6c321d35b230a1340bd73352b9fb95f35eb603f9 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Fri, 20 Oct 2023 00:21:08 +0000 Subject: [PATCH 08/18] Handle dataspec change --- llmfoundry/data/dataloader.py | 11 ++++++++++- llmfoundry/data/packing.py | 4 +++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/llmfoundry/data/dataloader.py b/llmfoundry/data/dataloader.py index 0d5f453cf4..9adeabefd6 100644 --- a/llmfoundry/data/dataloader.py +++ b/llmfoundry/data/dataloader.py @@ -3,6 +3,7 @@ """Dataloader builder utilities.""" +from composer import DataSpec from omegaconf import DictConfig from transformers import PreTrainedTokenizerBase @@ -14,7 +15,15 @@ def build_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, - device_batch_size: int): + device_batch_size: int) -> DataSpec: + """Builds a dataloader from a config. + + Args: + cfg (DictConfig): An omegaconf dictionary used to configure the loader. + tokenizer (PreTrainedTokenizerBase): The tokenizer that the model will use. + device_batch_size (int): The size of the batches (number of examples) + that the dataloader will produce. + """ if cfg.name == 'text': return build_text_dataloader( cfg, diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index 55d838b04a..0476db05d2 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 from typing import Callable, Dict, Iterable, List, Literal, Optional, Tuple +from composer import DataSpec import numpy as np import torch @@ -330,7 +331,8 @@ def profile_packing(dataloader_cfg: DictConfig, n_profile_examples = max(raw_batch_sizes) * 100 - train_dataloader = build_dataloader(dataloader_cfg, tokenizer, n_profile_examples) + train_dataspec = build_dataloader(dataloader_cfg, tokenizer, n_profile_examples) + train_dataloader = train_dataspec.dataloader # Get a bunch of raw examples big_batch = next(iter(train_dataloader)) From a852c231ba085425200bb25a7466ec3d37d7f9ef Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Fri, 20 Oct 2023 17:51:00 +0000 Subject: [PATCH 09/18] Add dataloader test --- tests/test_packing.py | 61 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 59 insertions(+), 2 deletions(-) diff --git a/tests/test_packing.py b/tests/test_packing.py index 0808157bc2..dff21b1987 100644 --- a/tests/test_packing.py +++ b/tests/test_packing.py @@ -1,11 +1,19 @@ -from typing import Dict, List +from typing import Any, Dict, List from unittest.mock import Mock, patch from llmfoundry.data.packing import BinPackCollator, auto_packing_ratio from omegaconf import DictConfig from pytest import approx +import pytest import torch -def _data_to_batch(data: List[int], max_seq_len: int, pad_token_id: int, ) -> Dict[str, torch.Tensor]: +from llmfoundry.utils.builders import build_tokenizer + +from llmfoundry.data.finetuning.dataloader import build_finetuning_dataloader + +from composer.utils import reproducibility + +def _data_to_batch(data: List[int], max_seq_len: int, pad_token_id: int) -> Dict[str, torch.Tensor]: + """Helper function to create a proper batch of data.""" input_ids = torch.stack([ torch.tensor(d + [pad_token_id] * (max_seq_len - len(d))) for d in data @@ -18,6 +26,7 @@ def _data_to_batch(data: List[int], max_seq_len: int, pad_token_id: int, ) -> Di return { 'input_ids': input_ids, 'attention_mask': attention_mask } def test_packing(): + """Tests that packing works for a single batch.""" pad_token_id = 0 max_seq_len = 5 pack = BinPackCollator( @@ -41,6 +50,7 @@ def 
test_packing(): assert torch.all(packed_samples['attention_mask'] == 1) def test_packing_with_leftovers(): + """Tests that packing handles leftovers and computes waste correctly.""" pad_token_id = 0 max_seq_len = 5 pack = BinPackCollator( @@ -79,6 +89,7 @@ def test_packing_with_leftovers(): @patch('llmfoundry.data.packing.profile_packing') def test_auto_packing(profile_packing: Mock): + """Tests that auto packing select the highest packing ratio with zero waste.""" # List of tuples of packing_ratio, padding, waste, sorted by packing ratio profile_packing.return_value = [(1, .9, 0), (2, .8, 0), (3, .7, .5)] @@ -90,3 +101,49 @@ def test_auto_packing(profile_packing: Mock): # auto packing ratio should choose 2 because packing ratio is maximized while waste is 0. assert packing_ratio == 2 + +@pytest.mark.parametrize('packing_ratio', ['auto', 2.0]) +def test_packing_with_dataloader(packing_ratio: Any): + """Tests that packing works with a dataloader.""" + reproducibility.seed_all(17) + tokenizer = build_tokenizer('gpt2', {}) + cfg = DictConfig({ + 'name': 'finetuning', + 'dataset': { + 'hf_name': 'tatsu-lab/alpaca', + 'split': 'train', + 'max_seq_len': 2048, + 'decoder_only_format': True, + 'allow_pad_trimming': False, + 'packing_ratio': packing_ratio, + 'shuffle': False, + }, + 'drop_last': False, + # Need to test with 0 num_workers because the packing collator object + # Gets copied per worker and we cannot check the waste for child processes. + 'num_workers': 0, + 'pin_memory': False, + 'prefetch_factor': None, + 'persistent_workers': False, + 'timeout': 0, + }) + + loader = build_finetuning_dataloader(cfg, tokenizer, + device_batch_size=6).dataloader + + pack_collator = loader.collate_fn + assert isinstance(pack_collator, BinPackCollator) + + batch_ix = 0 + for _ in loader: + batch_ix += 1 + if batch_ix >= 3: + break + + padding = (1 - pack_collator.efficiency) + if packing_ratio == 'auto': + assert pack_collator.waste == approx(0) + assert padding == approx(0.1197916, rel=.01) + else: + assert pack_collator.waste == approx(0) + assert padding == approx (0.873720, rel=.01) From d48fb97a7c83c2bcb27a06efa1d1ac70997486c9 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Sat, 21 Oct 2023 00:33:37 +0000 Subject: [PATCH 10/18] Add distributed autopacking --- llmfoundry/data/packing.py | 17 ++++++++++++----- tests/test_packing.py | 24 +++++++++++++++++++++++- 2 files changed, 35 insertions(+), 6 deletions(-) diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index 0476db05d2..ea73e0f095 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -272,6 +272,7 @@ def auto_packing_ratio(dataloader_cfg: DictConfig, Returns: A packing ratio that minimizes padding while maintaining zero waste. """ + from composer.utils import dist, get_device min_ratio = 1 max_ratio = dataloader_cfg.dataset.max_seq_len / 100 num_packing_ratios = 20 @@ -280,13 +281,19 @@ def auto_packing_ratio(dataloader_cfg: DictConfig, device_batch_size) # Obtain the maximum packing_ratio/minimum padding that has no waste. - prev_packing_ratio = 1 - for packing_ratio, _, waste in profiling_results: + packing_ratio = 1 + for packing_ratio_candidate, _, waste in profiling_results: if waste > 0: break - prev_packing_ratio = packing_ratio - return prev_packing_ratio - + packing_ratio = packing_ratio_candidate + + # Select the minimum packing ratio across all ranks. 
+ if dist.is_available() and dist.is_initialized(): + device = get_device('gpu') + packing_ratio_tensor = device.tensor_to_device(torch.tensor(packing_ratio)) + dist.all_reduce(packing_ratio_tensor, reduce_operation='MIN') + packing_ratio = packing_ratio_tensor.item() + return packing_ratio def profile_packing(dataloader_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, min_ratio: float, diff --git a/tests/test_packing.py b/tests/test_packing.py index dff21b1987..74a16bc41a 100644 --- a/tests/test_packing.py +++ b/tests/test_packing.py @@ -10,7 +10,7 @@ from llmfoundry.data.finetuning.dataloader import build_finetuning_dataloader -from composer.utils import reproducibility +from composer.utils import reproducibility, dist def _data_to_batch(data: List[int], max_seq_len: int, pad_token_id: int) -> Dict[str, torch.Tensor]: """Helper function to create a proper batch of data.""" @@ -102,6 +102,28 @@ def test_auto_packing(profile_packing: Mock): # auto packing ratio should choose 2 because packing ratio is maximized while waste is 0. assert packing_ratio == 2 +@pytest.mark.world_size(2) +@pytest.mark.gpu +@patch('llmfoundry.data.packing.profile_packing') +def test_dist_auto_packing(profile_packing: Mock): + """Tests that auto packing works with world size > 1.""" + dist.initialize_dist('gpu') + + # List of tuples of packing_ratio, padding, waste, sorted by packing ratio + if dist.get_global_rank() == 0: + profile_packing.return_value = [(1, .9, 0), (2, .8, 0), (3, .7, 0)] # should pick 3 + else: + profile_packing.return_value = [(1, .9, 0), (2, .8, 0), (3, .7, .5)] # should pick 2 + + packing_ratio = auto_packing_ratio( + dataloader_cfg=DictConfig({'dataset': {'max_seq_len': 2048 }}), + tokenizer=None, + device_batch_size=1, + ) # Dummy values, profiling results are already set. + + # auto packing ratio should choose 2 because it's the minimum between ranks. + assert packing_ratio == 2 + @pytest.mark.parametrize('packing_ratio', ['auto', 2.0]) def test_packing_with_dataloader(packing_ratio: Any): """Tests that packing works with a dataloader.""" From 8c08405c714e87c9d34f6477e49505f55ee19e4d Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Sat, 21 Oct 2023 00:39:51 +0000 Subject: [PATCH 11/18] Update comments for profile_packing script refactor --- llmfoundry/data/denoising.py | 4 ++-- llmfoundry/data/finetuning/dataloader.py | 4 ++-- mcli/mcli-llama2-finetune.yaml | 2 +- scripts/misc/profile_packing.py | 4 ++++ scripts/train/finetune_example/mpt-7b-arc-easy--gpu.yaml | 2 +- scripts/train/yamls/finetune/1b_local_data_sft.yaml | 2 +- scripts/train/yamls/finetune/7b_dolly_sft.yaml | 2 +- scripts/train/yamls/finetune/mpt-7b_dolly_sft.yaml | 2 +- 8 files changed, 13 insertions(+), 9 deletions(-) diff --git a/llmfoundry/data/denoising.py b/llmfoundry/data/denoising.py index f2830205d4..a83599e799 100644 --- a/llmfoundry/data/denoising.py +++ b/llmfoundry/data/denoising.py @@ -387,7 +387,7 @@ def build_text_denoising_dataloader( packing. Select packing_ratio **carefully** based on the dataset statistics, max_seq_len, and tolerance for discarding samples! - The packing code in `./packing.py` provides a script that can help + The script `scripts/misc/profile_packing.py` can help you choose the best packing_ratio. See :class:`StreamingTextDataset` for info on other standard config options within `cfg.dataset`. @@ -419,7 +419,7 @@ def build_text_denoising_dataloader( that the dataloader will produce. 
Note: - You can run the script inside `./packing.py` to quickly test the + You can use the script `scripts/misc/profile_packing.py` to quickly test the padding/waste rates for different `cfg.dataset.packing_ratio` choices, given a starting workload YAML. """ diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index ea6e9842f6..79566ea8b4 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -86,7 +86,7 @@ def build_finetuning_dataloader(cfg: DictConfig, packing. Select `packing_ratio` **carefully** based on the dataset statistics, `max_seq_len`, and tolerance for discarding samples! - The packing code in `../packing.py` provides a script that can help + `scripts/misc/profile_packing.py` is a script that can help you choose the best `packing_ratio`. cfg.dataset.shuffle (bool): Whether to shuffle the dataset. ___ @@ -106,7 +106,7 @@ def build_finetuning_dataloader(cfg: DictConfig, A pytorch dataloader Note: - You can run the script inside `../packing.py` to quickly test the + You can run the script inside `scripts/misc/profile_packing.py` to quickly test the padding/waste rates for different `cfg.dataset.packing_ratio` choices, given a starting workload YAML. """ diff --git a/mcli/mcli-llama2-finetune.yaml b/mcli/mcli-llama2-finetune.yaml index ae8f57abb6..92275a8a92 100644 --- a/mcli/mcli-llama2-finetune.yaml +++ b/mcli/mcli-llama2-finetune.yaml @@ -56,7 +56,7 @@ parameters: allow_pad_trimming: false decoder_only_format: true shuffle: true - # # Use `python llmfoundry/data/packing.py --yaml-path /path/to/this/yaml/ ...` + # # Use `python llmfoundry/scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...` # # to profile this run's optimal packing_ratio as it depends on GPU count, # # batch size, sequence length # packing_ratio: diff --git a/scripts/misc/profile_packing.py b/scripts/misc/profile_packing.py index a704dbea49..83d8d4d91d 100644 --- a/scripts/misc/profile_packing.py +++ b/scripts/misc/profile_packing.py @@ -1,3 +1,7 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +"""Script to profile example packing.""" from typing import Any, Dict from llmfoundry.data.packing import profile_packing diff --git a/scripts/train/finetune_example/mpt-7b-arc-easy--gpu.yaml b/scripts/train/finetune_example/mpt-7b-arc-easy--gpu.yaml index 2c3fb11496..1d936a23dc 100644 --- a/scripts/train/finetune_example/mpt-7b-arc-easy--gpu.yaml +++ b/scripts/train/finetune_example/mpt-7b-arc-easy--gpu.yaml @@ -41,7 +41,7 @@ train_loader: shuffle: true max_seq_len: ${max_seq_len} decoder_only_format: true - # # Use `python llmfoundry/data/packing.py --yaml-path /path/to/this/yaml/ ...` + # # Use `python llmfoundry/scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...` # # to profile this run's optimal packing_ratio as it depends on GPU count, # # batch size, sequence length # packing_ratio: diff --git a/scripts/train/yamls/finetune/1b_local_data_sft.yaml b/scripts/train/yamls/finetune/1b_local_data_sft.yaml index 45dca2f1e0..fc605441e9 100644 --- a/scripts/train/yamls/finetune/1b_local_data_sft.yaml +++ b/scripts/train/yamls/finetune/1b_local_data_sft.yaml @@ -49,7 +49,7 @@ train_loader: &train_loader allow_pad_trimming: false decoder_only_format: true shuffle: true - # # Use `python llmfoundry/data/packing.py --yaml-path /path/to/this/yaml/ ...` + # # Use `python llmfoundry/scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...` # # to profile 
this run's optimal packing_ratio as it depends on GPU count, # # batch size, sequence length # packing_ratio: diff --git a/scripts/train/yamls/finetune/7b_dolly_sft.yaml b/scripts/train/yamls/finetune/7b_dolly_sft.yaml index 6483dd31f5..6fb843690d 100644 --- a/scripts/train/yamls/finetune/7b_dolly_sft.yaml +++ b/scripts/train/yamls/finetune/7b_dolly_sft.yaml @@ -41,7 +41,7 @@ train_loader: allow_pad_trimming: false decoder_only_format: true shuffle: true - # # Use `python llmfoundry/data/packing.py --yaml-path /path/to/this/yaml/ ...` + # # Use `python llmfoundry/scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...` # # to profile this run's optimal packing_ratio as it depends on GPU count, # # batch size, sequence length # packing_ratio: diff --git a/scripts/train/yamls/finetune/mpt-7b_dolly_sft.yaml b/scripts/train/yamls/finetune/mpt-7b_dolly_sft.yaml index 9686317bef..b872a748c8 100644 --- a/scripts/train/yamls/finetune/mpt-7b_dolly_sft.yaml +++ b/scripts/train/yamls/finetune/mpt-7b_dolly_sft.yaml @@ -31,7 +31,7 @@ train_loader: max_seq_len: ${max_seq_len} allow_pad_trimming: false decoder_only_format: true - # # Use `python llmfoundry/data/packing.py --yaml-path /path/to/this/yaml/ ...` + # # Use `python llmfoundry/scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...` # # to profile this run's optimal packing_ratio as it depends on GPU count, # # batch size, sequence length # packing_ratio: From 6aab1ad93d197138cd28520021e82d2395d8d417 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Sat, 21 Oct 2023 03:30:12 +0000 Subject: [PATCH 12/18] add torch cuda check --- llmfoundry/data/packing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index ea73e0f095..a8d099f3b6 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -288,7 +288,7 @@ def auto_packing_ratio(dataloader_cfg: DictConfig, packing_ratio = packing_ratio_candidate # Select the minimum packing ratio across all ranks. - if dist.is_available() and dist.is_initialized(): + if torch.cuda.is_available() and dist.is_available() and dist.is_initialized(): device = get_device('gpu') packing_ratio_tensor = device.tensor_to_device(torch.tensor(packing_ratio)) dist.all_reduce(packing_ratio_tensor, reduce_operation='MIN') From aeffb4b41b4b2c86e1f66df4f1ea9897e2f4125b Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Sat, 21 Oct 2023 17:44:58 +0000 Subject: [PATCH 13/18] Use 0 workers for profiling because one batch is loaded per worker and we only load a single batch. 
--- llmfoundry/data/packing.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index a8d099f3b6..55d36527a3 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -323,6 +323,8 @@ def profile_packing(dataloader_cfg: DictConfig, dataloader_cfg.dataset.packing_ratio = None dataloader_cfg.dataset.max_leftovers_to_keep = None dataloader_cfg.drop_last = False + dataloader_cfg.num_workers = 0 + dataloader_cfg.prefetch_factor = None # Determine the packing_ratio values we'll try packing_ratios, raw_batch_sizes = [], [] From 96b4829c439e813ae5e95a57776c08ba9f6385de Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Mon, 23 Oct 2023 19:33:44 -0700 Subject: [PATCH 14/18] Fix code quality --- llmfoundry/data/__init__.py | 3 +- llmfoundry/data/dataloader.py | 4 +- llmfoundry/data/packing.py | 42 +++++++----- scripts/misc/profile_packing.py | 2 +- scripts/train/train.py | 4 +- tests/test_dataloader.py | 1 - tests/test_packing.py | 118 +++++++++++++++++++------------- 7 files changed, 99 insertions(+), 75 deletions(-) diff --git a/llmfoundry/data/__init__.py b/llmfoundry/data/__init__.py index 15dc588216..8da436b9b1 100644 --- a/llmfoundry/data/__init__.py +++ b/llmfoundry/data/__init__.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 from llmfoundry.data.data import ConcatTokensDataset, NoConcatDataset +from llmfoundry.data.dataloader import build_dataloader from llmfoundry.data.denoising import (MixtureOfDenoisersCollator, build_text_denoising_dataloader) from llmfoundry.data.finetuning import (Seq2SeqFinetuningCollator, @@ -9,8 +10,6 @@ from llmfoundry.data.text_data import (StreamingTextDataset, build_text_dataloader) -from llmfoundry.data.dataloader import build_dataloader - __all__ = [ 'MixtureOfDenoisersCollator', 'build_text_denoising_dataloader', diff --git a/llmfoundry/data/dataloader.py b/llmfoundry/data/dataloader.py index 9adeabefd6..12741717be 100644 --- a/llmfoundry/data/dataloader.py +++ b/llmfoundry/data/dataloader.py @@ -7,11 +7,9 @@ from omegaconf import DictConfig from transformers import PreTrainedTokenizerBase -from llmfoundry.data.text_data import build_text_dataloader - from llmfoundry.data.denoising import build_text_denoising_dataloader - from llmfoundry.data.finetuning.dataloader import build_finetuning_dataloader +from llmfoundry.data.text_data import build_text_dataloader def build_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index 55d36527a3..b949782e3c 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -2,13 +2,13 @@ # SPDX-License-Identifier: Apache-2.0 from typing import Callable, Dict, Iterable, List, Literal, Optional, Tuple -from composer import DataSpec import numpy as np import torch from omegaconf import DictConfig from transformers import PreTrainedTokenizerBase + class BinPackCollator: """Utility collator for packing to reduce padding.""" @@ -57,9 +57,11 @@ def efficiency(self) -> float: def __call__( self, - examples: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + examples: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]: batch = self.base_collator(examples) + return self.pack(batch) + def pack(self, batch: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: assert 'attention_mask' in batch assert 'input_ids' in batch @@ -93,14 +95,14 @@ def __call__( # Re-pad to max_seq_len and batch batch = _repad(packed_examples, - max_seq_len=self.max_seq_len, - 
pad_token_id=self.pad_token_id, - padding_side=self.padding_side) + max_seq_len=self.max_seq_len, + pad_token_id=self.pad_token_id, + padding_side=self.padding_side) return batch def _extract_trim_batch_idx(batch: Dict[str, torch.Tensor], - idx: int) -> Tuple[int, Dict[str, torch.Tensor]]: + idx: int) -> Tuple[int, Dict[str, torch.Tensor]]: example = {k: v[idx] for k, v in batch.items()} keep = example['attention_mask'] == 1 @@ -225,7 +227,7 @@ def _first_fit_bin_packing( def _repad(packed_examples: List[Dict[str, torch.Tensor]], max_seq_len: int, - pad_token_id: int, padding_side: str) -> Dict[str, torch.Tensor]: + pad_token_id: int, padding_side: str) -> Dict[str, torch.Tensor]: def pad_tensor(tensor: torch.Tensor, pad_value: int): if len(tensor) == max_seq_len: @@ -286,19 +288,22 @@ def auto_packing_ratio(dataloader_cfg: DictConfig, if waste > 0: break packing_ratio = packing_ratio_candidate - + # Select the minimum packing ratio across all ranks. - if torch.cuda.is_available() and dist.is_available() and dist.is_initialized(): + if torch.cuda.is_available() and dist.is_available( + ) and dist.is_initialized(): device = get_device('gpu') - packing_ratio_tensor = device.tensor_to_device(torch.tensor(packing_ratio)) + packing_ratio_tensor = device.tensor_to_device( + torch.tensor(packing_ratio)) dist.all_reduce(packing_ratio_tensor, reduce_operation='MIN') packing_ratio = packing_ratio_tensor.item() return packing_ratio -def profile_packing(dataloader_cfg: DictConfig, - tokenizer: PreTrainedTokenizerBase, min_ratio: float, - max_ratio: float, num_packing_ratios: int, - device_batch_size: int) -> Iterable[Tuple[float, float, float]]: + +def profile_packing( + dataloader_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, + min_ratio: float, max_ratio: float, num_packing_ratios: int, + device_batch_size: int) -> Iterable[Tuple[float, float, float]]: """Generator function that profiles example packing across packing ratios. Args: @@ -313,10 +318,12 @@ def profile_packing(dataloader_cfg: DictConfig, An iterable of tuples of packing ratio, padding, and waste. 
""" import copy + from llmfoundry.data.dataloader import build_dataloader max_seq_len = dataloader_cfg.dataset.get('max_seq_len') - max_leftovers_to_keep = dataloader_cfg.dataset.get('max_leftovers_to_keep', None) + max_leftovers_to_keep = dataloader_cfg.dataset.get('max_leftovers_to_keep', + None) # Turn off packing for the dataloader (we want raw, pre-packed examples) dataloader_cfg = copy.deepcopy(dataloader_cfg) @@ -340,7 +347,8 @@ def profile_packing(dataloader_cfg: DictConfig, n_profile_examples = max(raw_batch_sizes) * 100 - train_dataspec = build_dataloader(dataloader_cfg, tokenizer, n_profile_examples) + train_dataspec = build_dataloader(dataloader_cfg, tokenizer, + n_profile_examples) train_dataloader = train_dataspec.dataloader # Get a bunch of raw examples @@ -370,7 +378,7 @@ def profile(raw_batch_size: int) -> Tuple[float, float]: for batch in split_big_batch(raw_batch_size): if batch['input_ids'].shape[0] < device_batch_size: continue - _ = packer(batch) + _ = packer.pack(batch) # Return the padding / waste stats over that bunch of data padding_percent = 100 * (1 - packer.efficiency) diff --git a/scripts/misc/profile_packing.py b/scripts/misc/profile_packing.py index 83d8d4d91d..5b7d53db76 100644 --- a/scripts/misc/profile_packing.py +++ b/scripts/misc/profile_packing.py @@ -2,11 +2,11 @@ # SPDX-License-Identifier: Apache-2.0 """Script to profile example packing.""" +import os from typing import Any, Dict from llmfoundry.data.packing import profile_packing - if __name__ == '__main__': from argparse import ArgumentParser, Namespace diff --git a/scripts/train/train.py b/scripts/train/train.py index eb32bd04e2..e52c62d3d8 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -24,6 +24,7 @@ from llmfoundry import (COMPOSER_MODEL_REGISTRY, ComposerHFCausalLM, MPTForCausalLM) +from llmfoundry.data.dataloader import build_dataloader from llmfoundry.utils.builders import (build_algorithm, build_callback, build_icl_data_and_gauntlet, build_logger, build_optimizer, @@ -32,8 +33,6 @@ process_init_device, update_batch_size_info) -from llmfoundry.data.dataloader import build_dataloader - def validate_config(cfg: DictConfig): """Validates compatible model and dataloader selection.""" @@ -167,6 +166,7 @@ def print_trainable_parameters(model: torch.nn.Module) -> None: f'trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}' ) + def main(cfg: DictConfig) -> Trainer: # Filter deprecation warning from torch internal usage warnings.filterwarnings( diff --git a/tests/test_dataloader.py b/tests/test_dataloader.py index 2fcea0d0cc..8510ede913 100644 --- a/tests/test_dataloader.py +++ b/tests/test_dataloader.py @@ -8,7 +8,6 @@ import sys import tempfile from argparse import Namespace - from typing import Any, Optional from unittest.mock import MagicMock diff --git a/tests/test_packing.py b/tests/test_packing.py index 74a16bc41a..cbeca8b7b1 100644 --- a/tests/test_packing.py +++ b/tests/test_packing.py @@ -1,41 +1,44 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Dict, List from unittest.mock import Mock, patch -from llmfoundry.data.packing import BinPackCollator, auto_packing_ratio -from omegaconf import DictConfig -from pytest import approx + import pytest import torch - -from llmfoundry.utils.builders import build_tokenizer +from composer.utils import dist, reproducibility +from omegaconf import DictConfig +from pytest import approx +from torch.utils.data 
import DataLoader from llmfoundry.data.finetuning.dataloader import build_finetuning_dataloader +from llmfoundry.data.packing import BinPackCollator, auto_packing_ratio +from llmfoundry.utils.builders import build_tokenizer -from composer.utils import reproducibility, dist -def _data_to_batch(data: List[int], max_seq_len: int, pad_token_id: int) -> Dict[str, torch.Tensor]: +def _data_to_batch(data: List[List[int]], max_seq_len: int, + pad_token_id: int) -> Dict[str, torch.Tensor]: """Helper function to create a proper batch of data.""" input_ids = torch.stack([ - torch.tensor(d + [pad_token_id] * (max_seq_len - len(d))) - for d in data + torch.tensor(d + [pad_token_id] * (max_seq_len - len(d))) for d in data ]) attention_mask = torch.stack([ torch.tensor([1] * len(d) + [pad_token_id] * (max_seq_len - len(d))) for d in data ]) - return { 'input_ids': input_ids, 'attention_mask': attention_mask } + return {'input_ids': input_ids, 'attention_mask': attention_mask} + def test_packing(): """Tests that packing works for a single batch.""" pad_token_id = 0 max_seq_len = 5 - pack = BinPackCollator( - collator=lambda x: x, - target_batch_size=2, - max_seq_len=max_seq_len, - pad_token_id=pad_token_id, - padding_side = 'right' - ) + packer = BinPackCollator(collator=lambda x: x, + target_batch_size=2, + max_seq_len=max_seq_len, + pad_token_id=pad_token_id, + padding_side='right') batch = _data_to_batch([ [1], @@ -44,22 +47,22 @@ def test_packing(): [3] * 3, ], max_seq_len, pad_token_id) - packed_samples = pack(batch) + packed_samples = packer.pack(batch) - assert torch.equal(packed_samples['input_ids'], torch.Tensor([[3,3,3,2,2],[4,4,4,4,1]])) + assert torch.equal(packed_samples['input_ids'], + torch.Tensor([[3, 3, 3, 2, 2], [4, 4, 4, 4, 1]])) assert torch.all(packed_samples['attention_mask'] == 1) + def test_packing_with_leftovers(): """Tests that packing handles leftovers and computes waste correctly.""" pad_token_id = 0 max_seq_len = 5 - pack = BinPackCollator( - collator=lambda x: x, - target_batch_size=2, - max_seq_len=max_seq_len, - pad_token_id=pad_token_id, - padding_side = 'right' - ) + packer = BinPackCollator(collator=lambda x: x, + target_batch_size=2, + max_seq_len=max_seq_len, + pad_token_id=pad_token_id, + padding_side='right') batch = _data_to_batch([ [1], @@ -68,40 +71,51 @@ def test_packing_with_leftovers(): [4] * 4, ], max_seq_len, pad_token_id) - packed_batch = pack(batch) + packed_batch = packer.pack(batch) - assert torch.equal(packed_batch['input_ids'], torch.Tensor([[4,4,4,4,1],[4,4,4,4,0]])) - assert torch.equal(packed_batch['attention_mask'], torch.Tensor([[1,1,1,1,1],[1,1,1,1,0]])) + assert torch.equal(packed_batch['input_ids'], + torch.Tensor([[4, 4, 4, 4, 1], [4, 4, 4, 4, 0]])) + assert torch.equal(packed_batch['attention_mask'], + torch.Tensor([[1, 1, 1, 1, 1], [1, 1, 1, 1, 0]])) # Check leftovers and waste. 
- assert len(pack._leftover_bins) == 1 - leftover_size, leftover = pack._leftover_bins[0] + assert len(packer._leftover_bins) == 1 + leftover_size, leftover = packer._leftover_bins[0] assert leftover_size == 2 - assert torch.equal(leftover['input_ids'], torch.Tensor([2,2])) - assert torch.equal(leftover['attention_mask'], torch.Tensor([1,1])) - assert pack.waste == approx(2/11) # 2 tokens wasted of 11 tokens total + assert torch.equal(leftover['input_ids'], torch.Tensor([2, 2])) + assert torch.equal(leftover['attention_mask'], torch.Tensor([1, 1])) + assert packer.waste == approx(2 / 11) # 2 tokens wasted of 11 tokens total # Ensure that leftovers are used in the next batch if possible. batch = _data_to_batch([[1]], max_seq_len, pad_token_id) - packed_batch = pack(batch) - assert torch.equal(packed_batch['input_ids'], torch.Tensor([[2,2,0,0,0],[1,0,0,0,0]])) - assert torch.equal(packed_batch['attention_mask'], torch.Tensor([[1,1,0,0,0],[1,0,0,0,0]])) + packed_batch = packer.pack(batch) + assert torch.equal(packed_batch['input_ids'], + torch.Tensor([[2, 2, 0, 0, 0], [1, 0, 0, 0, 0]])) + assert torch.equal(packed_batch['attention_mask'], + torch.Tensor([[1, 1, 0, 0, 0], [1, 0, 0, 0, 0]])) + @patch('llmfoundry.data.packing.profile_packing') def test_auto_packing(profile_packing: Mock): - """Tests that auto packing select the highest packing ratio with zero waste.""" + """Tests that auto packing selects the highest packing ratio with zero. + + waste. + """ # List of tuples of packing_ratio, padding, waste, sorted by packing ratio profile_packing.return_value = [(1, .9, 0), (2, .8, 0), (3, .7, .5)] packing_ratio = auto_packing_ratio( - dataloader_cfg=DictConfig({'dataset': {'max_seq_len': 2048 }}), - tokenizer=None, + dataloader_cfg=DictConfig({'dataset': { + 'max_seq_len': 2048 + }}), + tokenizer=None, device_batch_size=1, - ) # Dummy values, profiling results are already set. + ) # Dummy values, profiling results are already set. # auto packing ratio should choose 2 because packing ratio is maximized while waste is 0. assert packing_ratio == 2 + @pytest.mark.world_size(2) @pytest.mark.gpu @patch('llmfoundry.data.packing.profile_packing') @@ -111,19 +125,24 @@ def test_dist_auto_packing(profile_packing: Mock): # List of tuples of packing_ratio, padding, waste, sorted by packing ratio if dist.get_global_rank() == 0: - profile_packing.return_value = [(1, .9, 0), (2, .8, 0), (3, .7, 0)] # should pick 3 + profile_packing.return_value = [(1, .9, 0), (2, .8, 0), + (3, .7, 0)] # should pick 3 else: - profile_packing.return_value = [(1, .9, 0), (2, .8, 0), (3, .7, .5)] # should pick 2 + profile_packing.return_value = [(1, .9, 0), (2, .8, 0), + (3, .7, .5)] # should pick 2 packing_ratio = auto_packing_ratio( - dataloader_cfg=DictConfig({'dataset': {'max_seq_len': 2048 }}), - tokenizer=None, + dataloader_cfg=DictConfig({'dataset': { + 'max_seq_len': 2048 + }}), + tokenizer=None, device_batch_size=1, - ) # Dummy values, profiling results are already set. + ) # Dummy values, profiling results are already set. # auto packing ratio should choose 2 because it's the minimum between ranks. 
assert packing_ratio == 2 + @pytest.mark.parametrize('packing_ratio', ['auto', 2.0]) def test_packing_with_dataloader(packing_ratio: Any): """Tests that packing works with a dataloader.""" @@ -151,8 +170,9 @@ def test_packing_with_dataloader(packing_ratio: Any): }) loader = build_finetuning_dataloader(cfg, tokenizer, - device_batch_size=6).dataloader - + device_batch_size=6).dataloader + + assert isinstance(loader, DataLoader) pack_collator = loader.collate_fn assert isinstance(pack_collator, BinPackCollator) @@ -168,4 +188,4 @@ def test_packing_with_dataloader(packing_ratio: Any): assert padding == approx(0.1197916, rel=.01) else: assert pack_collator.waste == approx(0) - assert padding == approx (0.873720, rel=.01) + assert padding == approx(0.873720, rel=.01) From d88cdcc9017df5fd45d72b850add6d969fc270c6 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Thu, 2 Nov 2023 15:47:21 -0700 Subject: [PATCH 15/18] Address PR comments --- llmfoundry/data/denoising.py | 8 +- llmfoundry/data/finetuning/dataloader.py | 20 ++- llmfoundry/data/packing.py | 127 +++++++++++++++++- mcli/mcli-llama2-finetune.yaml | 5 +- scripts/misc/profile_packing.py | 6 +- .../mpt-7b-arc-easy--gpu.yaml | 5 +- .../yamls/finetune/1b_local_data_sft.yaml | 5 +- .../train/yamls/finetune/7b_dolly_sft.yaml | 5 +- .../yamls/finetune/mpt-7b_dolly_sft.yaml | 5 +- tests/test_dataloader.py | 5 +- 10 files changed, 167 insertions(+), 24 deletions(-) diff --git a/llmfoundry/data/denoising.py b/llmfoundry/data/denoising.py index a83599e799..7d497b4efd 100644 --- a/llmfoundry/data/denoising.py +++ b/llmfoundry/data/denoising.py @@ -375,12 +375,18 @@ def build_text_denoising_dataloader( cfg.dataset.max_seq_len (int): The maximum length of sequences in the batch. See :class:`MixtureOfDenoisersCollator` docstring for details. - cfg.dataset.packing_ratio (float, optional): If provided, this invokes + cfg.dataset.packing_ratio (Optional[float, Literal['auto']]): If provided, this invokes a collator wrapper that packs device_batch_size*packing_ratio raw examples into device_batch_size packed examples. This helps minimize padding while preserving sequence integrity. This adds `sequence_id` to the batch, which indicates which unique sequence each token belongs to. + + If set to 'auto', packing_ratio is profiled and the highest observed packing ratio with + zero waste is selected. + In practice, this may result in > 0 waste because profiling is done on only a portion + of the dataset. + Note: Using this feature will not change device_batch_size but it will determine the number of raw examples consumed by the dataloader per batch. Some examples may be discarded if they do not fit when diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 79566ea8b4..1453d51591 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -74,20 +74,26 @@ def build_finetuning_dataloader(cfg: DictConfig, cfg.dataset.allow_pad_trimming (bool, optional): Whether to allow the collator to trim padding. See :class:`Seq2SeqFinetuningCollator` docstring for details. Default: ``False``. - cfg.dataset.packing_ratio (float, optional): If provided, this invokes - a collator wrapper that packs `device_batch_size*packing_ratio` - raw examples into `device_batch_size` packed examples. 
This helps + cfg.dataset.packing_ratio (Optional[float, Literal['auto']]): If provided, this invokes + a collator wrapper that packs device_batch_size*packing_ratio + raw examples into device_batch_size packed examples. This helps minimize padding while preserving sequence integrity. This adds `sequence_id` to the batch, which indicates which unique sequence each token belongs to. + + If set to 'auto', packing_ratio is profiled and the highest observed packing ratio with + zero waste is selected. + In practice, this may result in > 0 waste because profiling is done on only a portion + of the dataset. + Note: Using this feature will not change device_batch_size but it will determine the number of raw examples consumed by the dataloader per batch. Some examples may be discarded if they do not fit when packing. - Select `packing_ratio` **carefully** based on the dataset - statistics, `max_seq_len`, and tolerance for discarding samples! - `scripts/misc/profile_packing.py` is a script that can help - you choose the best `packing_ratio`. + Select packing_ratio **carefully** based on the dataset + statistics, max_seq_len, and tolerance for discarding samples! + The script `scripts/misc/profile_packing.py` can help + you choose the best packing_ratio. cfg.dataset.shuffle (bool): Whether to shuffle the dataset. ___ See :class:`StreamingFinetuningDataset` for info on other standard config diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index b949782e3c..ca74fe31bb 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -263,13 +263,24 @@ def pad_tensor(tensor: torch.Tensor, pad_value: int): def auto_packing_ratio(dataloader_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, - device_batch_size: int): + device_batch_size: int, + num_packing_ratios: int = 20) -> int: """Find a packing ratio that minimizes padding with zero waste. + By packing examples, we can increase training efficiency, training on more data with fewer batches. + However, in practice, the selected packing_ratio may produce some waste because profiling is done on only + a subset of the dataset. + + We select a min_ratio of 1 and a max_ratio that is the max_seq_len / 100, and profile up to + num_packing_ratios packing ratios between min_ratio and max_ratio, inclusive. + When a packing_ratio is found with non-zero waste is found, we stop and select the previous ratio, + which has zero waste. + Args: dataloader_cfg (DictConfig): The dataloader configuration for profiling. tokenizer (PreTrainedTokenizerBase): The tokenizer for profiling. device_batch_size (int): The size of the batches (number of examples) per device. + num_packing_ratios (int): The number of packing ratios to try. Returns: A packing ratio that minimizes padding while maintaining zero waste. @@ -277,12 +288,12 @@ def auto_packing_ratio(dataloader_cfg: DictConfig, from composer.utils import dist, get_device min_ratio = 1 max_ratio = dataloader_cfg.dataset.max_seq_len / 100 - num_packing_ratios = 20 profiling_results = profile_packing(dataloader_cfg, tokenizer, min_ratio, max_ratio, num_packing_ratios, device_batch_size) # Obtain the maximum packing_ratio/minimum padding that has no waste. + # profiling_results are sorted from smallest to largest packing_ratio. packing_ratio = 1 for packing_ratio_candidate, _, waste in profiling_results: if waste > 0: break packing_ratio = packing_ratio_candidate # Select the minimum packing ratio across all ranks.
- if torch.cuda.is_available() and dist.is_available( - ) and dist.is_initialized(): - device = get_device('gpu') + if dist.is_available() and dist.is_initialized(): + device = get_device(None) packing_ratio_tensor = device.tensor_to_device( torch.tensor(packing_ratio)) dist.all_reduce(packing_ratio_tensor, reduce_operation='MIN') @@ -315,7 +325,7 @@ def profile_packing( device_batch_size (int): The size of the batches (number of examples) per device. Returns: - An iterable of tuples of packing ratio, padding, and waste. + An iterable of tuples of packing ratio, padding, and waste, sorted by smallest to largest packing ratio. """ import copy @@ -388,3 +398,108 @@ def profile(raw_batch_size: int) -> Tuple[float, float]: for packing_ratio, raw_batch_size in zip(packing_ratios, raw_batch_sizes): padding, waste = profile(raw_batch_size) yield (packing_ratio, padding, waste) + + +if __name__ == '__main__': + + import warnings + + warnings.warn( + DeprecationWarning( + 'Please use scripts/misc/profile_packing.py to profile packing.' + + 'This script will be removed in later releases.')) + + import os + from argparse import ArgumentParser, Namespace + + from omegaconf import OmegaConf as om + + from llmfoundry.utils import build_tokenizer + + def parse_args() -> Namespace: + """Parse commandline arguments.""" + parser = ArgumentParser( + description= + 'Profile packing_ratio choices for a particular workload.') + parser.add_argument( + '--yaml-path', + type=str, + required=True, + help='Path to the YAML that defines the workload to profile.') + parser.add_argument('--num-devices', + type=int, + default=None, + help='How many devices your run will use.') + parser.add_argument('--min', + type=float, + required=True, + help='Smallest packing_ratio to test. Must be >=1.') + parser.add_argument( + '--max', + type=float, + required=True, + help='Largest packing_ratio to test. Must be larger than `min`.') + parser.add_argument( + '--num-packing-ratios', + type=int, + default=20, + help= + 'Number of packing_ratio values (spaced between `min` and `max) to try.' 
+ ) + + args = parser.parse_args() + + if not os.path.isfile(args.yaml_path): + raise FileNotFoundError( + '`yaml_path` does not correspond to any existing file.') + if args.num_devices < 1: + raise ValueError('`num_devices` must be a positive integer.') + if args.min < 1.0: + raise ValueError('`min` must be >=1.0.') + if args.max < args.min: + raise ValueError('`max` cannot be less than `min`.') + if args.num_packing_ratios < 1: + raise ValueError('`num_packing_ratios` must be a positive integer.') + return args + + args = parse_args() + + with open(args.yaml_path) as f: + cfg = om.load(f) + if 'parameters' in cfg: + cfg = om.to_container(cfg.parameters) + cfg = om.create(cfg) + device_batch_size = cfg.global_train_batch_size // args.num_devices + + # Fetch a bunch of raw examples once, which we'll re-use + if 'train_loader' not in cfg: + raise ValueError('config must define train_loader') + dataloader_cfg = cfg.train_loader + + max_leftovers_to_keep = dataloader_cfg.dataset.get('max_leftovers_to_keep', + None) + + # build tokenizer + if 'tokenizer' not in cfg: + raise ValueError('config must define tokenizer') + + resolved_tokenizer_cfg = om.to_container(cfg.tokenizer, resolve=True) + if not isinstance(resolved_tokenizer_cfg, Dict): + raise ValueError( + 'tokenizer config needs to be resolved by omegaconf into a Dict.') + tokenizer_cfg = resolved_tokenizer_cfg + + tokenizer_name = tokenizer_cfg['name'] + tokenizer_kwargs = tokenizer_cfg.get('kwargs', {}) + tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) + + results = profile_packing(dataloader_cfg, tokenizer, args.min, args.max, + args.num_packing_ratios, device_batch_size) + + header = '\n\n\n packing_ratio | % PADDING | % WASTE' + fstr = ' {:5.1f} | {:5.2f}% | {:6.2f}%' + + print(header) + print('-' * len(header)) + for packing_ratio, padding, waste in results: + print(fstr.format(packing_ratio, padding, waste)) diff --git a/mcli/mcli-llama2-finetune.yaml b/mcli/mcli-llama2-finetune.yaml index 92275a8a92..93d46f57e3 100644 --- a/mcli/mcli-llama2-finetune.yaml +++ b/mcli/mcli-llama2-finetune.yaml @@ -56,7 +56,10 @@ parameters: allow_pad_trimming: false decoder_only_format: true shuffle: true - # # Use `python llmfoundry/scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...` + # # Use packing_ratio: 'auto' to automatically profile and select the highest observed packing ratio with + # # zero waste. In practice, this may result in > 0 waste because profiling is done on only a portion + # # of the dataset. + # # Or use `python llmfoundry/scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...` # # to profile this run's optimal packing_ratio as it depends on GPU count, # # batch size, sequence length # packing_ratio: diff --git a/scripts/misc/profile_packing.py b/scripts/misc/profile_packing.py index 5b7d53db76..b834b9951a 100644 --- a/scripts/misc/profile_packing.py +++ b/scripts/misc/profile_packing.py @@ -3,7 +3,7 @@ """Script to profile example packing.""" import os -from typing import Any, Dict +from typing import Dict from llmfoundry.data.packing import profile_packing @@ -40,7 +40,7 @@ def parse_args() -> Namespace: parser.add_argument( '--num-packing-ratios', type=int, - default=10, + default=20, help= 'Number of packing_ratio values (spaced between `min` and `max) to try.' 
) @@ -85,7 +85,7 @@ def parse_args() -> Namespace: if not isinstance(resolved_tokenizer_cfg, Dict): raise ValueError( 'tokenizer config needs to be resolved by omegaconf into a Dict.') - tokenizer_cfg: Dict[Any, Any] = resolved_tokenizer_cfg + tokenizer_cfg = resolved_tokenizer_cfg tokenizer_name = tokenizer_cfg['name'] tokenizer_kwargs = tokenizer_cfg.get('kwargs', {}) diff --git a/scripts/train/finetune_example/mpt-7b-arc-easy--gpu.yaml b/scripts/train/finetune_example/mpt-7b-arc-easy--gpu.yaml index 1d936a23dc..ed2e9fcac0 100644 --- a/scripts/train/finetune_example/mpt-7b-arc-easy--gpu.yaml +++ b/scripts/train/finetune_example/mpt-7b-arc-easy--gpu.yaml @@ -41,7 +41,10 @@ train_loader: shuffle: true max_seq_len: ${max_seq_len} decoder_only_format: true - # # Use `python llmfoundry/scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...` + # # Use packing_ratio: 'auto' to automatically profile and select the highest observed packing ratio with + # # zero waste. In practice, this may result in > 0 waste because profiling is done on only a portion + # # of the dataset. + # # Or use `python llmfoundry/scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...` # # to profile this run's optimal packing_ratio as it depends on GPU count, # # batch size, sequence length # packing_ratio: diff --git a/scripts/train/yamls/finetune/1b_local_data_sft.yaml b/scripts/train/yamls/finetune/1b_local_data_sft.yaml index fc605441e9..d6f72b0c8e 100644 --- a/scripts/train/yamls/finetune/1b_local_data_sft.yaml +++ b/scripts/train/yamls/finetune/1b_local_data_sft.yaml @@ -49,7 +49,10 @@ train_loader: &train_loader allow_pad_trimming: false decoder_only_format: true shuffle: true - # # Use `python llmfoundry/scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...` + # # Use packing_ratio: 'auto' to automatically profile and select the highest observed packing ratio with + # # zero waste. In practice, this may result in > 0 waste because profiling is done on only a portion + # # of the dataset. + # # Or use `python llmfoundry/scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...` # # to profile this run's optimal packing_ratio as it depends on GPU count, # # batch size, sequence length # packing_ratio: diff --git a/scripts/train/yamls/finetune/7b_dolly_sft.yaml b/scripts/train/yamls/finetune/7b_dolly_sft.yaml index 6fb843690d..c5813235d9 100644 --- a/scripts/train/yamls/finetune/7b_dolly_sft.yaml +++ b/scripts/train/yamls/finetune/7b_dolly_sft.yaml @@ -41,7 +41,10 @@ train_loader: allow_pad_trimming: false decoder_only_format: true shuffle: true - # # Use `python llmfoundry/scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...` + # # Use packing_ratio: 'auto' to automatically profile and select the highest observed packing ratio with + # # zero waste. In practice, this may result in > 0 waste because profiling is done on only a portion + # # of the dataset. 
+ # # Or use `python llmfoundry/scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...` # # to profile this run's optimal packing_ratio as it depends on GPU count, # # batch size, sequence length # packing_ratio: diff --git a/scripts/train/yamls/finetune/mpt-7b_dolly_sft.yaml b/scripts/train/yamls/finetune/mpt-7b_dolly_sft.yaml index b872a748c8..2f23d8e55a 100644 --- a/scripts/train/yamls/finetune/mpt-7b_dolly_sft.yaml +++ b/scripts/train/yamls/finetune/mpt-7b_dolly_sft.yaml @@ -31,7 +31,10 @@ train_loader: max_seq_len: ${max_seq_len} allow_pad_trimming: false decoder_only_format: true - # # Use `python llmfoundry/scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...` + # # Use packing_ratio: 'auto' to automatically profile and select the highest observed packing ratio with + # # zero waste. In practice, this may result in > 0 waste because profiling is done on only a portion + # # of the dataset. + # # Or use `python llmfoundry/scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...` # # to profile this run's optimal packing_ratio as it depends on GPU count, # # batch size, sequence length # packing_ratio: diff --git a/tests/test_dataloader.py b/tests/test_dataloader.py index 8510ede913..2080ec32ec 100644 --- a/tests/test_dataloader.py +++ b/tests/test_dataloader.py @@ -8,7 +8,7 @@ import sys import tempfile from argparse import Namespace -from typing import Any, Optional +from typing import Literal, Optional, Union from unittest.mock import MagicMock import pytest @@ -251,7 +251,8 @@ def test_denoising_dataloader(decoder_only_format: bool, pretokenize: bool, @pytest.mark.parametrize('packing_ratio', [10.0, None, 'auto']) def test_finetuning_dataloader(decoder_only_format: bool, allow_pad_trimming: bool, - packing_ratio: Optional[Any]): + packing_ratio: Optional[Union[float, + Literal['auto']]]): # Use the datasets just built in the last test tokenizer_name = 'gpt2' if decoder_only_format else 't5-base' max_seq_len = 2048 if decoder_only_format else 1024 From 57cb170b9c5b45dfce1c1db12082d84e4ef11ce0 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Fri, 3 Nov 2023 11:42:11 -0700 Subject: [PATCH 16/18] Set random seed for auto packing to make it deterministic --- llmfoundry/data/packing.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index ca74fe31bb..ccb5ec2be2 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -264,7 +264,7 @@ def pad_tensor(tensor: torch.Tensor, pad_value: int): def auto_packing_ratio(dataloader_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, device_batch_size: int, - num_packing_ratios: int = 20) -> int: + num_packing_ratios: int = 20) -> float: """Find a packing ratio that minimizes padding with zero waste. By packing examples, we can increase training efficiency, training on more data with less batches. @@ -285,7 +285,13 @@ def auto_packing_ratio(dataloader_cfg: DictConfig, Returns: A packing ratio that minimizes padding while maintaining zero waste. """ - from composer.utils import dist, get_device + from composer.utils import dist, get_device, reproducibility + + # Stash the rng state to restore later. + rng_state = reproducibility.get_rng_state() + # Set the seed so that auto packing is deterministic. 
+ reproducibility.seed_all(0) + min_ratio = 1 max_ratio = dataloader_cfg.dataset.max_seq_len / 100 profiling_results = profile_packing(dataloader_cfg, tokenizer, min_ratio, @@ -307,6 +313,10 @@ def auto_packing_ratio(dataloader_cfg: DictConfig, torch.tensor(packing_ratio)) dist.all_reduce(packing_ratio_tensor, reduce_operation='MIN') packing_ratio = packing_ratio_tensor.item() + + # Restore rng state. + reproducibility.load_rng_state(rng_state) + return packing_ratio From de6b45d57575d2e25d1eac70ab10218777cf3cb5 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Fri, 3 Nov 2023 14:24:10 -0700 Subject: [PATCH 17/18] Fix typo Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- llmfoundry/data/packing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index ccb5ec2be2..e66a707cd9 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -273,7 +273,7 @@ def auto_packing_ratio(dataloader_cfg: DictConfig, We select a min_ratio of 1 and a max_ratio that is the max_seq_len / 100, and profile up to num_packing_ratios packing ratios between min_ratio and max_ratio, inclusive. - When a packing_ratio is found with non-zero waste is found, we stop and select the previous ratio, + When a packing_ratio with non-zero waste is found, we stop and select the previous ratio, which has zero waste. Args: From 2ff88c2ca9aab530cc9af89fc6aa8405fad1e34e Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Sat, 4 Nov 2023 11:54:32 -0700 Subject: [PATCH 18/18] Update max_leftover_bins_to_keep to keep all and remove unused variables --- llmfoundry/data/finetuning/dataloader.py | 11 +++++++---- llmfoundry/data/packing.py | 11 ++--------- scripts/misc/profile_packing.py | 3 --- 3 files changed, 9 insertions(+), 16 deletions(-) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 1453d51591..6e988ac149 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -377,16 +377,19 @@ def _build_collate_fn( device_batch_size: int ) -> Tuple[Union[Seq2SeqFinetuningCollator, BinPackCollator], int]: dataset_cfg = dataloader_cfg.dataset + max_seq_len = dataset_cfg.max_seq_len + collate_fn = Seq2SeqFinetuningCollator( tokenizer=tokenizer, - max_seq_len=dataset_cfg.max_seq_len, + max_seq_len=max_seq_len, decoder_only_format=dataset_cfg.decoder_only_format, allow_pad_trimming=dataset_cfg.get('allow_pad_trimming', False), ) packing_ratio = dataset_cfg.get('packing_ratio') + max_leftover_bins_to_keep = dataset_cfg.get('max_leftover_bins_to_keep') if packing_ratio is None: - if dataset_cfg.get('max_leftover_bins_to_keep') is not None: + if max_leftover_bins_to_keep is not None: raise ValueError( 'dataset.max_leftover_bins_to_keep has been defined, ' +\ 'but dataset.packing_ratio has not been set. 
Please set ' +\ @@ -410,10 +413,10 @@ def _build_collate_fn( collate_fn = BinPackCollator( collator=collate_fn, target_batch_size=device_batch_size, - max_seq_len=dataset_cfg.max_seq_len, + max_seq_len=max_seq_len, pad_token_id=tokenizer.pad_token_id, padding_side=tokenizer.padding_side, - max_leftover_bins_to_keep=dataset_cfg.get('max_leftover_bins_to_keep'), + max_leftover_bins_to_keep=max_leftover_bins_to_keep, ) n_examples_to_pack = int(device_batch_size * packing_ratio) return collate_fn, n_examples_to_pack diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index e66a707cd9..1ae9efcce5 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -32,13 +32,10 @@ def __init__(self, if self.pad_token_id < 0: raise ValueError(f'{pad_token_id=} must be >=0.') - if max_leftover_bins_to_keep is None: - self.max_leftover_bins_to_keep = int(10 * self.out_size) - elif max_leftover_bins_to_keep < 0: + if max_leftover_bins_to_keep is not None and max_leftover_bins_to_keep < 0: raise ValueError( f'{max_leftover_bins_to_keep=} must be >=0 or None.') - else: - self.max_leftover_bins_to_keep = int(max_leftover_bins_to_keep) + self.max_leftover_bins_to_keep = max_leftover_bins_to_keep self.n_packed_tokens = 0 self.n_total_tokens = 0 @@ -348,7 +345,6 @@ def profile_packing( # Turn off packing for the dataloader (we want raw, pre-packed examples) dataloader_cfg = copy.deepcopy(dataloader_cfg) dataloader_cfg.dataset.packing_ratio = None - dataloader_cfg.dataset.max_leftovers_to_keep = None dataloader_cfg.drop_last = False dataloader_cfg.num_workers = 0 dataloader_cfg.prefetch_factor = None @@ -486,9 +482,6 @@ def parse_args() -> Namespace: raise ValueError('config must define train_loader') dataloader_cfg = cfg.train_loader - max_leftovers_to_keep = dataloader_cfg.dataset.get('max_leftovers_to_keep', - None) - # build tokenizer if 'tokenizer' not in cfg: raise ValueError('config must define tokenizer') diff --git a/scripts/misc/profile_packing.py b/scripts/misc/profile_packing.py index b834b9951a..51841d669e 100644 --- a/scripts/misc/profile_packing.py +++ b/scripts/misc/profile_packing.py @@ -74,9 +74,6 @@ def parse_args() -> Namespace: raise ValueError('config must define train_loader') dataloader_cfg = cfg.train_loader - max_leftovers_to_keep = dataloader_cfg.dataset.get('max_leftovers_to_keep', - None) - # build tokenizer if 'tokenizer' not in cfg: raise ValueError('config must define tokenizer')