From d48fb97a7c83c2bcb27a06efa1d1ac70997486c9 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Sat, 21 Oct 2023 00:33:37 +0000 Subject: [PATCH] Add distributed autopacking --- llmfoundry/data/packing.py | 17 ++++++++++++----- tests/test_packing.py | 24 +++++++++++++++++++++++- 2 files changed, 35 insertions(+), 6 deletions(-) diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index 0476db05d2..ea73e0f095 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -272,6 +272,7 @@ def auto_packing_ratio(dataloader_cfg: DictConfig, Returns: A packing ratio that minimizes padding while maintaining zero waste. """ + from composer.utils import dist, get_device min_ratio = 1 max_ratio = dataloader_cfg.dataset.max_seq_len / 100 num_packing_ratios = 20 @@ -280,13 +281,19 @@ def auto_packing_ratio(dataloader_cfg: DictConfig, device_batch_size) # Obtain the maximum packing_ratio/minimum padding that has no waste. - prev_packing_ratio = 1 - for packing_ratio, _, waste in profiling_results: + packing_ratio = 1 + for packing_ratio_candidate, _, waste in profiling_results: if waste > 0: break - prev_packing_ratio = packing_ratio - return prev_packing_ratio - + packing_ratio = packing_ratio_candidate + + # Select the minimum packing ratio across all ranks. + if dist.is_available() and dist.is_initialized(): + device = get_device('gpu') + packing_ratio_tensor = device.tensor_to_device(torch.tensor(packing_ratio)) + dist.all_reduce(packing_ratio_tensor, reduce_operation='MIN') + packing_ratio = packing_ratio_tensor.item() + return packing_ratio def profile_packing(dataloader_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, min_ratio: float, diff --git a/tests/test_packing.py b/tests/test_packing.py index dff21b1987..74a16bc41a 100644 --- a/tests/test_packing.py +++ b/tests/test_packing.py @@ -10,7 +10,7 @@ from llmfoundry.data.finetuning.dataloader import build_finetuning_dataloader -from composer.utils import reproducibility +from composer.utils import reproducibility, dist def _data_to_batch(data: List[int], max_seq_len: int, pad_token_id: int) -> Dict[str, torch.Tensor]: """Helper function to create a proper batch of data.""" @@ -102,6 +102,28 @@ def test_auto_packing(profile_packing: Mock): # auto packing ratio should choose 2 because packing ratio is maximized while waste is 0. assert packing_ratio == 2 +@pytest.mark.world_size(2) +@pytest.mark.gpu +@patch('llmfoundry.data.packing.profile_packing') +def test_dist_auto_packing(profile_packing: Mock): + """Tests that auto packing works with world size > 1.""" + dist.initialize_dist('gpu') + + # List of tuples of packing_ratio, padding, waste, sorted by packing ratio + if dist.get_global_rank() == 0: + profile_packing.return_value = [(1, .9, 0), (2, .8, 0), (3, .7, 0)] # should pick 3 + else: + profile_packing.return_value = [(1, .9, 0), (2, .8, 0), (3, .7, .5)] # should pick 2 + + packing_ratio = auto_packing_ratio( + dataloader_cfg=DictConfig({'dataset': {'max_seq_len': 2048 }}), + tokenizer=None, + device_batch_size=1, + ) # Dummy values, profiling results are already set. + + # auto packing ratio should choose 2 because it's the minimum between ranks. + assert packing_ratio == 2 + @pytest.mark.parametrize('packing_ratio', ['auto', 2.0]) def test_packing_with_dataloader(packing_ratio: Any): """Tests that packing works with a dataloader."""