From efe4d65260671914d70c057542c5f64d05b7edb4 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Wed, 27 Mar 2024 17:57:50 +0000 Subject: [PATCH 1/8] log packing ratio progress --- llmfoundry/data/packing.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index fba3ab2d3e..44061e6f8a 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -317,6 +317,8 @@ def auto_packing_ratio(dataloader_cfg: DictConfig, """ from composer.utils import dist, get_device, reproducibility + log.debug('Searching for optimal packing ratio.') + # Stash the rng state to restore later. rng_state = reproducibility.get_rng_state() # Set the seed so that auto packing is deterministic. @@ -352,6 +354,7 @@ def auto_packing_ratio(dataloader_cfg: DictConfig, # Restore rng state. reproducibility.load_rng_state(rng_state) + log.debug(f'Found packing ratio: {packing_ratio}') return packing_ratio From cf2e17c5705f7caeab502477497861c3b0215350 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Wed, 27 Mar 2024 18:38:44 +0000 Subject: [PATCH 2/8] use tqdm to show progress of packing ratio profiling --- llmfoundry/data/packing.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index 44061e6f8a..8b20e35da3 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -8,6 +8,8 @@ import numpy as np import torch from omegaconf import DictConfig +from tqdm import tqdm +from tqdm.contrib.logging import logging_redirect_tqdm from transformers import PreTrainedTokenizerBase log = logging.getLogger(__name__) @@ -452,6 +454,10 @@ def profile(raw_batch_size: int) -> Tuple[Optional[float], Optional[float]]: waste_percent = 100 * packer.waste return padding_percent, waste_percent - for packing_ratio, raw_batch_size in zip(packing_ratios, raw_batch_sizes): - padding, waste = profile(raw_batch_size) - yield (packing_ratio, padding, waste) + with logging_redirect_tqdm(loggers=[log]): + for packing_ratio, raw_batch_size in tqdm( + zip(packing_ratios, raw_batch_sizes), + desc='Profiling packing ratios'): + tqdm.set_description_str(f'Profiling packing ratio {packing_ratio}') + padding, waste = profile(raw_batch_size) + yield (packing_ratio, padding, waste) From 83bc841400f2309123c93673e2a75e2a3fc7db8a Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Wed, 27 Mar 2024 19:30:13 +0000 Subject: [PATCH 3/8] fix pbar --- llmfoundry/data/packing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index 8b20e35da3..d1f4924999 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -455,9 +455,9 @@ def profile(raw_batch_size: int) -> Tuple[Optional[float], Optional[float]]: return padding_percent, waste_percent with logging_redirect_tqdm(loggers=[log]): - for packing_ratio, raw_batch_size in tqdm( + for packing_ratio, raw_batch_size in (pbar := tqdm( zip(packing_ratios, raw_batch_sizes), - desc='Profiling packing ratios'): - tqdm.set_description_str(f'Profiling packing ratio {packing_ratio}') + desc='Profiling packing ratios')): + pbar.set_description_str(f'Profiling packing ratio {packing_ratio}') padding, waste = profile(raw_batch_size) yield (packing_ratio, padding, waste) From a5585c0e9b32006c0d007a7815d9879da2a3eaff Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 28 Mar 2024 12:35:48 -0400 Subject: [PATCH 4/8] Update packing.py --- llmfoundry/data/packing.py | 1 - 1 file changed, 1 deletion(-) diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index 83d764cecc..ec28ef09c2 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -354,7 +354,6 @@ def auto_packing_ratio(dataloader_cfg: DictConfig, # Restore rng state. reproducibility.load_rng_state(rng_state) - log.debug(f'Found packing ratio: {packing_ratio}') return packing_ratio From 6a7746ec705a8f442d77935462790bd4485b375b Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Tue, 2 Apr 2024 21:11:32 +0000 Subject: [PATCH 5/8] added log to search for --- llmfoundry/data/packing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index ec28ef09c2..753281fcb6 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -451,6 +451,7 @@ def profile(raw_batch_size: int) -> Tuple[Optional[float], Optional[float]]: waste_percent = 100 * packer.waste return padding_percent, waste_percent + log.info('Profiling packing ratios') with logging_redirect_tqdm(loggers=[log]): for packing_ratio, raw_batch_size in (pbar := tqdm( zip(packing_ratios, raw_batch_sizes), From 0e6f3bfea163ebc2d1615376a40902c9374c45fd Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Tue, 2 Apr 2024 21:39:32 +0000 Subject: [PATCH 6/8] fix --- llmfoundry/data/packing.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index 753281fcb6..7f389e00a9 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -3,18 +3,27 @@ import logging import tempfile +from io import StringIO from typing import Callable, Dict, Iterable, List, Literal, Optional, Tuple import numpy as np import torch from omegaconf import DictConfig from tqdm import tqdm -from tqdm.contrib.logging import logging_redirect_tqdm from transformers import PreTrainedTokenizerBase log = logging.getLogger(__name__) +class LogIO(StringIO): + + def __init__(self, log: logging.Logger): + self.log = log + + def write(self, message: str): + self.log.debug(message) + + class BinPackCollator: """Utility collator for packing to reduce padding.""" @@ -452,10 +461,11 @@ def profile(raw_batch_size: int) -> Tuple[Optional[float], Optional[float]]: return padding_percent, waste_percent log.info('Profiling packing ratios') - with logging_redirect_tqdm(loggers=[log]): - for packing_ratio, raw_batch_size in (pbar := tqdm( - zip(packing_ratios, raw_batch_sizes), - desc='Profiling packing ratios')): - pbar.set_description_str(f'Profiling packing ratio {packing_ratio}') - padding, waste = profile(raw_batch_size) - yield (packing_ratio, padding, waste) + for packing_ratio, raw_batch_size in (pbar := + tqdm(zip(packing_ratios, + raw_batch_sizes), + desc='Profiling packing ratios', + file=LogIO(log))): + pbar.set_description_str(f'Profiling packing ratio {packing_ratio}') + padding, waste = profile(raw_batch_size) + yield (packing_ratio, padding, waste) From 94b7d579fd72d5d148726c942a48bbfc42965193 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Wed, 3 Apr 2024 02:31:15 +0000 Subject: [PATCH 7/8] good old fashioned progress logging --- llmfoundry/data/packing.py | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index 7f389e00a9..c85f7659a0 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -3,7 +3,6 @@ import logging import tempfile -from io import StringIO from typing import Callable, Dict, Iterable, List, Literal, Optional, Tuple import numpy as np @@ -15,15 +14,6 @@ log = logging.getLogger(__name__) -class LogIO(StringIO): - - def __init__(self, log: logging.Logger): - self.log = log - - def write(self, message: str): - self.log.debug(message) - - class BinPackCollator: """Utility collator for packing to reduce padding.""" @@ -460,12 +450,12 @@ def profile(raw_batch_size: int) -> Tuple[Optional[float], Optional[float]]: waste_percent = 100 * packer.waste return padding_percent, waste_percent - log.info('Profiling packing ratios') - for packing_ratio, raw_batch_size in (pbar := - tqdm(zip(packing_ratios, - raw_batch_sizes), - desc='Profiling packing ratios', - file=LogIO(log))): - pbar.set_description_str(f'Profiling packing ratio {packing_ratio}') + log.debug('Profiling packing ratios') + total_packing_ratios = min(len(packing_ratios), len(raw_batch_sizes)) + for i, (packing_ratio, + raw_batch_size) in enumerate(zip(packing_ratios, raw_batch_sizes)): + log.debug( + f'Progress [{i}/{total_packing_ratios}]: Profiling packing ratio {packing_ratio}' + ) padding, waste = profile(raw_batch_size) yield (packing_ratio, padding, waste) From f858fda78fb476cc1b63ac69f9baef092b407126 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Wed, 3 Apr 2024 11:32:18 -0400 Subject: [PATCH 8/8] Update packing.py --- llmfoundry/data/packing.py | 1 - 1 file changed, 1 deletion(-) diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index c85f7659a0..e875a67820 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -8,7 +8,6 @@ import numpy as np import torch from omegaconf import DictConfig -from tqdm import tqdm from transformers import PreTrainedTokenizerBase log = logging.getLogger(__name__)