From 96b4829c439e813ae5e95a57776c08ba9f6385de Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Mon, 23 Oct 2023 19:33:44 -0700 Subject: [PATCH] Fix code quality --- llmfoundry/data/__init__.py | 3 +- llmfoundry/data/dataloader.py | 4 +- llmfoundry/data/packing.py | 42 +++++++----- scripts/misc/profile_packing.py | 2 +- scripts/train/train.py | 4 +- tests/test_dataloader.py | 1 - tests/test_packing.py | 118 +++++++++++++++++++------------- 7 files changed, 99 insertions(+), 75 deletions(-) diff --git a/llmfoundry/data/__init__.py b/llmfoundry/data/__init__.py index 15dc588216..8da436b9b1 100644 --- a/llmfoundry/data/__init__.py +++ b/llmfoundry/data/__init__.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 from llmfoundry.data.data import ConcatTokensDataset, NoConcatDataset +from llmfoundry.data.dataloader import build_dataloader from llmfoundry.data.denoising import (MixtureOfDenoisersCollator, build_text_denoising_dataloader) from llmfoundry.data.finetuning import (Seq2SeqFinetuningCollator, @@ -9,8 +10,6 @@ from llmfoundry.data.text_data import (StreamingTextDataset, build_text_dataloader) -from llmfoundry.data.dataloader import build_dataloader - __all__ = [ 'MixtureOfDenoisersCollator', 'build_text_denoising_dataloader', diff --git a/llmfoundry/data/dataloader.py b/llmfoundry/data/dataloader.py index 9adeabefd6..12741717be 100644 --- a/llmfoundry/data/dataloader.py +++ b/llmfoundry/data/dataloader.py @@ -7,11 +7,9 @@ from omegaconf import DictConfig from transformers import PreTrainedTokenizerBase -from llmfoundry.data.text_data import build_text_dataloader - from llmfoundry.data.denoising import build_text_denoising_dataloader - from llmfoundry.data.finetuning.dataloader import build_finetuning_dataloader +from llmfoundry.data.text_data import build_text_dataloader def build_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index 55d36527a3..b949782e3c 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -2,13 +2,13 @@ # SPDX-License-Identifier: Apache-2.0 from typing import Callable, Dict, Iterable, List, Literal, Optional, Tuple -from composer import DataSpec import numpy as np import torch from omegaconf import DictConfig from transformers import PreTrainedTokenizerBase + class BinPackCollator: """Utility collator for packing to reduce padding.""" @@ -57,9 +57,11 @@ def efficiency(self) -> float: def __call__( self, - examples: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + examples: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]: batch = self.base_collator(examples) + return self.pack(batch) + def pack(self, batch: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: assert 'attention_mask' in batch assert 'input_ids' in batch @@ -93,14 +95,14 @@ def __call__( # Re-pad to max_seq_len and batch batch = _repad(packed_examples, - max_seq_len=self.max_seq_len, - pad_token_id=self.pad_token_id, - padding_side=self.padding_side) + max_seq_len=self.max_seq_len, + pad_token_id=self.pad_token_id, + padding_side=self.padding_side) return batch def _extract_trim_batch_idx(batch: Dict[str, torch.Tensor], - idx: int) -> Tuple[int, Dict[str, torch.Tensor]]: + idx: int) -> Tuple[int, Dict[str, torch.Tensor]]: example = {k: v[idx] for k, v in batch.items()} keep = example['attention_mask'] == 1 @@ -225,7 +227,7 @@ def _first_fit_bin_packing( def _repad(packed_examples: List[Dict[str, torch.Tensor]], max_seq_len: int, - pad_token_id: int, padding_side: str) -> 
Dict[str, torch.Tensor]: + pad_token_id: int, padding_side: str) -> Dict[str, torch.Tensor]: def pad_tensor(tensor: torch.Tensor, pad_value: int): if len(tensor) == max_seq_len: @@ -286,19 +288,22 @@ def auto_packing_ratio(dataloader_cfg: DictConfig, if waste > 0: break packing_ratio = packing_ratio_candidate - + # Select the minimum packing ratio across all ranks. - if torch.cuda.is_available() and dist.is_available() and dist.is_initialized(): + if torch.cuda.is_available() and dist.is_available( + ) and dist.is_initialized(): device = get_device('gpu') - packing_ratio_tensor = device.tensor_to_device(torch.tensor(packing_ratio)) + packing_ratio_tensor = device.tensor_to_device( + torch.tensor(packing_ratio)) dist.all_reduce(packing_ratio_tensor, reduce_operation='MIN') packing_ratio = packing_ratio_tensor.item() return packing_ratio -def profile_packing(dataloader_cfg: DictConfig, - tokenizer: PreTrainedTokenizerBase, min_ratio: float, - max_ratio: float, num_packing_ratios: int, - device_batch_size: int) -> Iterable[Tuple[float, float, float]]: + +def profile_packing( + dataloader_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, + min_ratio: float, max_ratio: float, num_packing_ratios: int, + device_batch_size: int) -> Iterable[Tuple[float, float, float]]: """Generator function that profiles example packing across packing ratios. Args: @@ -313,10 +318,12 @@ def profile_packing(dataloader_cfg: DictConfig, An iterable of tuples of packing ratio, padding, and waste. """ import copy + from llmfoundry.data.dataloader import build_dataloader max_seq_len = dataloader_cfg.dataset.get('max_seq_len') - max_leftovers_to_keep = dataloader_cfg.dataset.get('max_leftovers_to_keep', None) + max_leftovers_to_keep = dataloader_cfg.dataset.get('max_leftovers_to_keep', + None) # Turn off packing for the dataloader (we want raw, pre-packed examples) dataloader_cfg = copy.deepcopy(dataloader_cfg) @@ -340,7 +347,8 @@ def profile_packing(dataloader_cfg: DictConfig, n_profile_examples = max(raw_batch_sizes) * 100 - train_dataspec = build_dataloader(dataloader_cfg, tokenizer, n_profile_examples) + train_dataspec = build_dataloader(dataloader_cfg, tokenizer, + n_profile_examples) train_dataloader = train_dataspec.dataloader # Get a bunch of raw examples @@ -370,7 +378,7 @@ def profile(raw_batch_size: int) -> Tuple[float, float]: for batch in split_big_batch(raw_batch_size): if batch['input_ids'].shape[0] < device_batch_size: continue - _ = packer(batch) + _ = packer.pack(batch) # Return the padding / waste stats over that bunch of data padding_percent = 100 * (1 - packer.efficiency) diff --git a/scripts/misc/profile_packing.py b/scripts/misc/profile_packing.py index 83d8d4d91d..5b7d53db76 100644 --- a/scripts/misc/profile_packing.py +++ b/scripts/misc/profile_packing.py @@ -2,11 +2,11 @@ # SPDX-License-Identifier: Apache-2.0 """Script to profile example packing.""" +import os from typing import Any, Dict from llmfoundry.data.packing import profile_packing - if __name__ == '__main__': from argparse import ArgumentParser, Namespace diff --git a/scripts/train/train.py b/scripts/train/train.py index eb32bd04e2..e52c62d3d8 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -24,6 +24,7 @@ from llmfoundry import (COMPOSER_MODEL_REGISTRY, ComposerHFCausalLM, MPTForCausalLM) +from llmfoundry.data.dataloader import build_dataloader from llmfoundry.utils.builders import (build_algorithm, build_callback, build_icl_data_and_gauntlet, build_logger, build_optimizer, @@ -32,8 +33,6 @@ process_init_device, 
update_batch_size_info) -from llmfoundry.data.dataloader import build_dataloader - def validate_config(cfg: DictConfig): """Validates compatible model and dataloader selection.""" @@ -167,6 +166,7 @@ def print_trainable_parameters(model: torch.nn.Module) -> None: f'trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}' ) + def main(cfg: DictConfig) -> Trainer: # Filter deprecation warning from torch internal usage warnings.filterwarnings( diff --git a/tests/test_dataloader.py b/tests/test_dataloader.py index 2fcea0d0cc..8510ede913 100644 --- a/tests/test_dataloader.py +++ b/tests/test_dataloader.py @@ -8,7 +8,6 @@ import sys import tempfile from argparse import Namespace - from typing import Any, Optional from unittest.mock import MagicMock diff --git a/tests/test_packing.py b/tests/test_packing.py index 74a16bc41a..cbeca8b7b1 100644 --- a/tests/test_packing.py +++ b/tests/test_packing.py @@ -1,41 +1,44 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Dict, List from unittest.mock import Mock, patch -from llmfoundry.data.packing import BinPackCollator, auto_packing_ratio -from omegaconf import DictConfig -from pytest import approx + import pytest import torch - -from llmfoundry.utils.builders import build_tokenizer +from composer.utils import dist, reproducibility +from omegaconf import DictConfig +from pytest import approx +from torch.utils.data import DataLoader from llmfoundry.data.finetuning.dataloader import build_finetuning_dataloader +from llmfoundry.data.packing import BinPackCollator, auto_packing_ratio +from llmfoundry.utils.builders import build_tokenizer -from composer.utils import reproducibility, dist -def _data_to_batch(data: List[int], max_seq_len: int, pad_token_id: int) -> Dict[str, torch.Tensor]: +def _data_to_batch(data: List[List[int]], max_seq_len: int, + pad_token_id: int) -> Dict[str, torch.Tensor]: """Helper function to create a proper batch of data.""" input_ids = torch.stack([ - torch.tensor(d + [pad_token_id] * (max_seq_len - len(d))) - for d in data + torch.tensor(d + [pad_token_id] * (max_seq_len - len(d))) for d in data ]) attention_mask = torch.stack([ torch.tensor([1] * len(d) + [pad_token_id] * (max_seq_len - len(d))) for d in data ]) - return { 'input_ids': input_ids, 'attention_mask': attention_mask } + return {'input_ids': input_ids, 'attention_mask': attention_mask} + def test_packing(): """Tests that packing works for a single batch.""" pad_token_id = 0 max_seq_len = 5 - pack = BinPackCollator( - collator=lambda x: x, - target_batch_size=2, - max_seq_len=max_seq_len, - pad_token_id=pad_token_id, - padding_side = 'right' - ) + packer = BinPackCollator(collator=lambda x: x, + target_batch_size=2, + max_seq_len=max_seq_len, + pad_token_id=pad_token_id, + padding_side='right') batch = _data_to_batch([ [1], @@ -44,22 +47,22 @@ def test_packing(): [3] * 3, ], max_seq_len, pad_token_id) - packed_samples = pack(batch) + packed_samples = packer.pack(batch) - assert torch.equal(packed_samples['input_ids'], torch.Tensor([[3,3,3,2,2],[4,4,4,4,1]])) + assert torch.equal(packed_samples['input_ids'], + torch.Tensor([[3, 3, 3, 2, 2], [4, 4, 4, 4, 1]])) assert torch.all(packed_samples['attention_mask'] == 1) + def test_packing_with_leftovers(): """Tests that packing handles leftovers and computes waste correctly.""" pad_token_id = 0 max_seq_len = 5 - pack = BinPackCollator( - collator=lambda x: x, - target_batch_size=2, - 
max_seq_len=max_seq_len, - pad_token_id=pad_token_id, - padding_side = 'right' - ) + packer = BinPackCollator(collator=lambda x: x, + target_batch_size=2, + max_seq_len=max_seq_len, + pad_token_id=pad_token_id, + padding_side='right') batch = _data_to_batch([ [1], @@ -68,40 +71,51 @@ def test_packing_with_leftovers(): [4] * 4, ], max_seq_len, pad_token_id) - packed_batch = pack(batch) + packed_batch = packer.pack(batch) - assert torch.equal(packed_batch['input_ids'], torch.Tensor([[4,4,4,4,1],[4,4,4,4,0]])) - assert torch.equal(packed_batch['attention_mask'], torch.Tensor([[1,1,1,1,1],[1,1,1,1,0]])) + assert torch.equal(packed_batch['input_ids'], + torch.Tensor([[4, 4, 4, 4, 1], [4, 4, 4, 4, 0]])) + assert torch.equal(packed_batch['attention_mask'], + torch.Tensor([[1, 1, 1, 1, 1], [1, 1, 1, 1, 0]])) # Check leftovers and waste. - assert len(pack._leftover_bins) == 1 - leftover_size, leftover = pack._leftover_bins[0] + assert len(packer._leftover_bins) == 1 + leftover_size, leftover = packer._leftover_bins[0] assert leftover_size == 2 - assert torch.equal(leftover['input_ids'], torch.Tensor([2,2])) - assert torch.equal(leftover['attention_mask'], torch.Tensor([1,1])) - assert pack.waste == approx(2/11) # 2 tokens wasted of 11 tokens total + assert torch.equal(leftover['input_ids'], torch.Tensor([2, 2])) + assert torch.equal(leftover['attention_mask'], torch.Tensor([1, 1])) + assert packer.waste == approx(2 / 11) # 2 tokens wasted of 11 tokens total # Ensure that leftovers are used in the next batch if possible. batch = _data_to_batch([[1]], max_seq_len, pad_token_id) - packed_batch = pack(batch) - assert torch.equal(packed_batch['input_ids'], torch.Tensor([[2,2,0,0,0],[1,0,0,0,0]])) - assert torch.equal(packed_batch['attention_mask'], torch.Tensor([[1,1,0,0,0],[1,0,0,0,0]])) + packed_batch = packer.pack(batch) + assert torch.equal(packed_batch['input_ids'], + torch.Tensor([[2, 2, 0, 0, 0], [1, 0, 0, 0, 0]])) + assert torch.equal(packed_batch['attention_mask'], + torch.Tensor([[1, 1, 0, 0, 0], [1, 0, 0, 0, 0]])) + @patch('llmfoundry.data.packing.profile_packing') def test_auto_packing(profile_packing: Mock): - """Tests that auto packing select the highest packing ratio with zero waste.""" + """Tests that auto packing selects the highest packing ratio with zero. + + waste. + """ # List of tuples of packing_ratio, padding, waste, sorted by packing ratio profile_packing.return_value = [(1, .9, 0), (2, .8, 0), (3, .7, .5)] packing_ratio = auto_packing_ratio( - dataloader_cfg=DictConfig({'dataset': {'max_seq_len': 2048 }}), - tokenizer=None, + dataloader_cfg=DictConfig({'dataset': { + 'max_seq_len': 2048 + }}), + tokenizer=None, device_batch_size=1, - ) # Dummy values, profiling results are already set. + ) # Dummy values, profiling results are already set. # auto packing ratio should choose 2 because packing ratio is maximized while waste is 0. 
assert packing_ratio == 2 + @pytest.mark.world_size(2) @pytest.mark.gpu @patch('llmfoundry.data.packing.profile_packing') @@ -111,19 +125,24 @@ def test_dist_auto_packing(profile_packing: Mock): # List of tuples of packing_ratio, padding, waste, sorted by packing ratio if dist.get_global_rank() == 0: - profile_packing.return_value = [(1, .9, 0), (2, .8, 0), (3, .7, 0)] # should pick 3 + profile_packing.return_value = [(1, .9, 0), (2, .8, 0), + (3, .7, 0)] # should pick 3 else: - profile_packing.return_value = [(1, .9, 0), (2, .8, 0), (3, .7, .5)] # should pick 2 + profile_packing.return_value = [(1, .9, 0), (2, .8, 0), + (3, .7, .5)] # should pick 2 packing_ratio = auto_packing_ratio( - dataloader_cfg=DictConfig({'dataset': {'max_seq_len': 2048 }}), - tokenizer=None, + dataloader_cfg=DictConfig({'dataset': { + 'max_seq_len': 2048 + }}), + tokenizer=None, device_batch_size=1, - ) # Dummy values, profiling results are already set. + ) # Dummy values, profiling results are already set. # auto packing ratio should choose 2 because it's the minimum between ranks. assert packing_ratio == 2 + @pytest.mark.parametrize('packing_ratio', ['auto', 2.0]) def test_packing_with_dataloader(packing_ratio: Any): """Tests that packing works with a dataloader.""" @@ -151,8 +170,9 @@ def test_packing_with_dataloader(packing_ratio: Any): }) loader = build_finetuning_dataloader(cfg, tokenizer, - device_batch_size=6).dataloader - + device_batch_size=6).dataloader + + assert isinstance(loader, DataLoader) pack_collator = loader.collate_fn assert isinstance(pack_collator, BinPackCollator) @@ -168,4 +188,4 @@ def test_packing_with_dataloader(packing_ratio: Any): assert padding == approx(0.1197916, rel=.01) else: assert pack_collator.waste == approx(0) - assert padding == approx (0.873720, rel=.01) + assert padding == approx(0.873720, rel=.01)
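
Note for reviewers: beyond import reordering and yapf cleanup, the one behavioral refactor in this patch is that BinPackCollator's packing logic now lives in a public pack() method that operates on an already-collated batch, with __call__ reduced to collate-then-pack and its type hint corrected to List[Dict[str, torch.Tensor]]. The snippet below is a minimal sketch of driving the new method directly, mirroring the updated tests in tests/test_packing.py; the toy sequences and the printed shape are illustrative assumptions, not part of the patch.

    import torch

    from llmfoundry.data.packing import BinPackCollator

    pad_token_id = 0
    max_seq_len = 5
    packer = BinPackCollator(collator=lambda x: x,
                             target_batch_size=2,
                             max_seq_len=max_seq_len,
                             pad_token_id=pad_token_id,
                             padding_side='right')

    # Pre-collated toy batch: four right-padded sequences of lengths 1, 2, 4 and 3.
    data = [[1], [2] * 2, [4] * 4, [3] * 3]
    batch = {
        'input_ids':
            torch.stack([
                torch.tensor(d + [pad_token_id] * (max_seq_len - len(d)))
                for d in data
            ]),
        'attention_mask':
            torch.stack([
                torch.tensor([1] * len(d) + [0] * (max_seq_len - len(d)))
                for d in data
            ]),
    }

    # pack() bin-packs the four examples into target_batch_size=2 rows
    # without re-running the base collator.
    packed = packer.pack(batch)
    print(packed['input_ids'].shape)  # torch.Size([2, 5])
    print(packer.waste, packer.efficiency)

Call sites that previously invoked the collator on an already-collated batch, such as profile_packing, now go through packer.pack(batch), which is what the corresponding change in llmfoundry/data/packing.py above does.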