
Commit

additional logging to get maximum token length of a sequence in the dataset (#1066) [skip ci]

* additional logging to get maximum token length of a sequence in the dataset

* fix ordering to properly determine the max_len of tokens before dropping anything longer
winglian authored Jan 10, 2024
1 parent 0ce1a65 commit 2f2582e
Showing 1 changed file with 10 additions and 6 deletions.
16 changes: 10 additions & 6 deletions src/axolotl/utils/trainer.py
@@ -109,12 +109,6 @@ def disable_datasets_caching():
 def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
     drop_long = partial(drop_long_seq, sequence_len=cfg.sequence_len)
     with zero_first(is_main_process()):
-        train_dataset = train_dataset.filter(drop_long, num_proc=cfg.dataset_processes)
-        if eval_dataset:
-            eval_dataset = eval_dataset.filter(
-                drop_long, num_proc=cfg.dataset_processes
-            )
-
         if cfg.group_by_length:
             train_dataset = train_dataset.map(
                 add_length, num_proc=cfg.dataset_processes
@@ -130,6 +124,16 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
                         add_position_ids, num_proc=cfg.dataset_processes
                     )
 
+        if cfg.group_by_length or cfg.sample_packing:
+            max_input_len = np.max(get_dataset_lengths(train_dataset))
+            LOG.debug(f"max_input_len: {max_input_len}", main_process_only=True)
+
+        train_dataset = train_dataset.filter(drop_long, num_proc=cfg.dataset_processes)
+        if eval_dataset:
+            eval_dataset = eval_dataset.filter(
+                drop_long, num_proc=cfg.dataset_processes
+            )
+
         # Phi doesn't want the attention_mask feature when training
         if (
             "CodeGenTokenizer" in tokenizer.__class__.__name__
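
Why the ordering fix matters, as a minimal standalone sketch (not part of the diff): logging the maximum length before applying the drop_long filter reports the true longest sequence in the dataset, whereas filtering first would cap the reported value at cfg.sequence_len. The toy lengths below are hypothetical; in axolotl the real values come from get_dataset_lengths(train_dataset) and the filtering is done by drop_long_seq.

```python
import numpy as np

# Hypothetical token lengths; axolotl derives these via get_dataset_lengths(train_dataset).
token_lengths = np.array([512, 1024, 4096, 2048])
sequence_len = 2048  # stands in for cfg.sequence_len

# New ordering: compute the max *before* dropping long samples,
# so the log shows the longest sequence actually present in the data.
max_input_len = np.max(token_lengths)
print(f"max_input_len: {max_input_len}")  # 4096

# Old ordering: filtering first caps the observable max at sequence_len,
# hiding how long the dropped samples really were.
kept = token_lengths[token_lengths <= sequence_len]
print(f"max after filtering: {np.max(kept)}")  # 2048
```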

0 comments on commit 2f2582e
