Add distributed autopacking
irenedea committed Oct 21, 2023
1 parent a852c23 commit d48fb97
Showing 2 changed files with 35 additions and 6 deletions.
17 changes: 12 additions & 5 deletions llmfoundry/data/packing.py
@@ -272,6 +272,7 @@ def auto_packing_ratio(dataloader_cfg: DictConfig,
     Returns:
         A packing ratio that minimizes padding while maintaining zero waste.
     """
+    from composer.utils import dist, get_device
     min_ratio = 1
     max_ratio = dataloader_cfg.dataset.max_seq_len / 100
     num_packing_ratios = 20
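
The diff does not show profile_packing's internals, but given min_ratio, max_ratio, and num_packing_ratios above, the candidate ratios are presumably swept on an evenly spaced grid. A minimal sketch of that assumption (the linspace call and the concrete max_seq_len are illustrative, not taken from this commit):

import numpy as np

min_ratio = 1
max_ratio = 2048 / 100  # e.g. dataset.max_seq_len = 2048 -> 20.48
num_packing_ratios = 20

# Hypothetical: 20 evenly spaced candidate ratios in [1, 20.48], each of
# which profile_packing would evaluate for padding and waste.
candidate_ratios = np.linspace(min_ratio, max_ratio, num_packing_ratios)
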
@@ -280,13 +281,19 @@ def auto_packing_ratio(dataloader_cfg: DictConfig,
                                         device_batch_size)
 
     # Obtain the maximum packing_ratio/minimum padding that has no waste.
-    prev_packing_ratio = 1
-    for packing_ratio, _, waste in profiling_results:
+    packing_ratio = 1
+    for packing_ratio_candidate, _, waste in profiling_results:
         if waste > 0:
             break
-        prev_packing_ratio = packing_ratio
-    return prev_packing_ratio
+        packing_ratio = packing_ratio_candidate
+
+    # Select the minimum packing ratio across all ranks.
+    if dist.is_available() and dist.is_initialized():
+        device = get_device('gpu')
+        packing_ratio_tensor = device.tensor_to_device(torch.tensor(packing_ratio))
+        dist.all_reduce(packing_ratio_tensor, reduce_operation='MIN')
+        packing_ratio = packing_ratio_tensor.item()
+    return packing_ratio
 
 
 def profile_packing(dataloader_cfg: DictConfig,
                     tokenizer: PreTrainedTokenizerBase, min_ratio: float,
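The MIN reduction is the heart of the change: each rank profiles packing on its own shard of the data, so ranks can disagree about the largest zero-waste ratio, and every rank must use the same ratio for batch shapes to stay consistent. Taking the minimum backs all ranks off to a ratio that is waste-free everywhere. A minimal sketch of the same agreement step in raw torch.distributed (an illustration, not the committed code, which uses composer's dist and get_device helpers; assumes an initialized NCCL process group and a CUDA device):

import torch
import torch.distributed as torch_dist

def agree_on_packing_ratio(local_ratio: float) -> float:
    # Every rank contributes its locally chosen ratio; all ranks
    # receive the global minimum so they pack identically.
    if torch_dist.is_available() and torch_dist.is_initialized():
        ratio_tensor = torch.tensor(float(local_ratio), device='cuda')
        torch_dist.all_reduce(ratio_tensor, op=torch_dist.ReduceOp.MIN)
        return ratio_tensor.item()
    return local_ratio
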
24 changes: 23 additions & 1 deletion tests/test_packing.py
@@ -10,7 +10,7 @@
 
 from llmfoundry.data.finetuning.dataloader import build_finetuning_dataloader
 
-from composer.utils import reproducibility
+from composer.utils import reproducibility, dist
 
 def _data_to_batch(data: List[int], max_seq_len: int, pad_token_id: int) -> Dict[str, torch.Tensor]:
     """Helper function to create a proper batch of data."""
@@ -102,6 +102,28 @@ def test_auto_packing(profile_packing: Mock):
     # auto packing ratio should choose 2 because packing ratio is maximized while waste is 0.
     assert packing_ratio == 2
 
+@pytest.mark.world_size(2)
+@pytest.mark.gpu
+@patch('llmfoundry.data.packing.profile_packing')
+def test_dist_auto_packing(profile_packing: Mock):
+    """Tests that auto packing works with world size > 1."""
+    dist.initialize_dist('gpu')
+
+    # List of tuples of packing_ratio, padding, waste, sorted by packing ratio.
+    if dist.get_global_rank() == 0:
+        profile_packing.return_value = [(1, .9, 0), (2, .8, 0), (3, .7, 0)]  # should pick 3
+    else:
+        profile_packing.return_value = [(1, .9, 0), (2, .8, 0), (3, .7, .5)]  # should pick 2
+
+    packing_ratio = auto_packing_ratio(
+        dataloader_cfg=DictConfig({'dataset': {'max_seq_len': 2048}}),
+        tokenizer=None,
+        device_batch_size=1,
+    )  # Dummy values, profiling results are already set.
+
+    # auto packing ratio should choose 2 because it's the minimum between ranks.
+    assert packing_ratio == 2
+
 @pytest.mark.parametrize('packing_ratio', ['auto', 2.0])
 def test_packing_with_dataloader(packing_ratio: Any):
     """Tests that packing works with a dataloader."""
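As a worked illustration of what the test asserts (plain Python, reusing the mocked values above): rank 0's results are waste-free through ratio 3, rank 1's show waste at ratio 3, and the MIN reduction settles on 2. The world_size(2) and gpu marks presumably route this test through a two-process GPU launcher.

# Per-rank profiling results as mocked in the test above.
rank_results = {
    0: [(1, .9, 0), (2, .8, 0), (3, .7, 0)],   # rank 0: zero waste up to 3
    1: [(1, .9, 0), (2, .8, 0), (3, .7, .5)],  # rank 1: waste appears at 3
}

def local_pick(results):
    # Same scan as auto_packing_ratio: largest ratio before waste > 0.
    ratio = 1
    for candidate, _, waste in results:
        if waste > 0:
            break
        ratio = candidate
    return ratio

picks = [local_pick(r) for r in rank_results.values()]  # [3, 2]
assert min(picks) == 2  # the cross-rank MIN reduction yields 2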
