From 2f2582e6eda62384cb878d78faba482d39cf9f78 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Wed, 10 Jan 2024 00:49:31 -0500
Subject: [PATCH] additional logging to get maximum token length of a sequence
 in the dataset (#1066) [skip ci]

* additional logging to get maximum token length of a sequence in the dataset

* fix ordering to properly determine the max_len of tokens before dropping anything longer
---
 src/axolotl/utils/trainer.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py
index 17806de658..5588e768fb 100644
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -109,12 +109,6 @@ def disable_datasets_caching():
 def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
     drop_long = partial(drop_long_seq, sequence_len=cfg.sequence_len)
     with zero_first(is_main_process()):
-        train_dataset = train_dataset.filter(drop_long, num_proc=cfg.dataset_processes)
-        if eval_dataset:
-            eval_dataset = eval_dataset.filter(
-                drop_long, num_proc=cfg.dataset_processes
-            )
-
         if cfg.group_by_length:
             train_dataset = train_dataset.map(
                 add_length, num_proc=cfg.dataset_processes
@@ -130,6 +124,16 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
                 add_position_ids, num_proc=cfg.dataset_processes
             )
 
+        if cfg.group_by_length or cfg.sample_packing:
+            max_input_len = np.max(get_dataset_lengths(train_dataset))
+            LOG.debug(f"max_input_len: {max_input_len}", main_process_only=True)
+
+        train_dataset = train_dataset.filter(drop_long, num_proc=cfg.dataset_processes)
+        if eval_dataset:
+            eval_dataset = eval_dataset.filter(
+                drop_long, num_proc=cfg.dataset_processes
+            )
+
         # Phi doesn't want the attention_mask feature when training
         if (
             "CodeGenTokenizer" in tokenizer.__class__.__name__
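
The sketch below is a minimal, self-contained approximation of the reordered flow this patch lands: log the maximum tokenized length while over-long rows are still present, and only then drop anything longer than sequence_len (previously the filter ran first, so the logged maximum could never exceed sequence_len). The helper bodies get_dataset_lengths and drop_long_seq and the plain logging logger are simplified assumptions standing in for axolotl's own utilities; in the real code LOG is axolotl's multiprocess-aware logger, which accepts main_process_only=True.

import logging
from functools import partial

import numpy as np
from datasets import Dataset

LOG = logging.getLogger(__name__)


def get_dataset_lengths(dataset):
    # Assumed stand-in: derive per-sample token counts from the tokenized input_ids column.
    return np.array([len(input_ids) for input_ids in dataset["input_ids"]])


def drop_long_seq(sample, sequence_len=2048):
    # Keep only rows whose tokenized length fits the configured context window.
    return len(sample["input_ids"]) <= sequence_len


def process_for_packing(train_dataset, sequence_len=2048, num_proc=None):
    drop_long = partial(drop_long_seq, sequence_len=sequence_len)

    # Log the maximum length first, while over-long rows are still present,
    # so the reported value reflects the raw dataset ...
    max_input_len = np.max(get_dataset_lengths(train_dataset))
    LOG.debug("max_input_len: %s", max_input_len)

    # ... and only then drop anything longer than sequence_len.
    return train_dataset.filter(drop_long, num_proc=num_proc)


if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    ds = Dataset.from_dict({"input_ids": [[1] * 10, [1] * 5000, [1] * 300]})
    ds = process_for_packing(ds, sequence_len=2048)
    print(len(ds))  # 2 rows remain; the debug log reported max_input_len: 5000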