From 8f04f14ea51229daf04f81e2865666ba95fd86f1 Mon Sep 17 00:00:00 2001
From: Casper
Date: Mon, 22 Jan 2024 10:50:52 +0100
Subject: [PATCH 1/2] Add desc to map/filter

---
 src/axolotl/cli/__init__.py  |  5 ++++-
 src/axolotl/datasets.py      |  1 +
 src/axolotl/utils/data.py    |  1 +
 src/axolotl/utils/trainer.py | 11 ++++++++++-
 4 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/src/axolotl/cli/__init__.py b/src/axolotl/cli/__init__.py
index a68377f07c..15da78b094 100644
--- a/src/axolotl/cli/__init__.py
+++ b/src/axolotl/cli/__init__.py
@@ -410,7 +410,10 @@ def ultra_apply_chatml(sample):  # pylint: disable=possibly-unused-variable
     for i, data_set in enumerate(train_datasets):
         _type = cfg.datasets[i]["type"]
         ds_type_fn = locals()[_type]
-        train_datasets[i] = data_set.map(ds_type_fn)
+        train_datasets[i] = data_set.map(
+            ds_type_fn,
+            desc="Mapping RL Dataset",
+        )
     train_dataset = concatenate_datasets(train_datasets)
 
     # eval_dataset = eval_dataset.map(intel_apply_chatml)
diff --git a/src/axolotl/datasets.py b/src/axolotl/datasets.py
index 837b0d6749..1f04889c27 100644
--- a/src/axolotl/datasets.py
+++ b/src/axolotl/datasets.py
@@ -57,6 +57,7 @@ def process(self, dataset):
             num_proc=num_proc,
             remove_columns=features,
             keep_in_memory=self.keep_in_memory,
+            desc="Tokenizing Prompts",
             **map_kwargs,
         )
 
diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py
index 6726f2ad14..2d642405fb 100644
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -782,6 +782,7 @@ def load_pretraining_dataset(path, tokenizer, cfg, name=None, max_tokens=2048, s
         # remove all the existing columns after mapping since they end up having
         # a different length than the encoded/tokenized column
         remove_columns=dataset.features.keys(),
+        desc="Encoding Pretraining",
     )
     return dataset
 
diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py
index b8235d3cf8..b8b9f55c0e 100644
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -128,12 +128,14 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
         drop_long,
         num_proc=cfg.dataset_processes,
         load_from_cache_file=not cfg.is_preprocess,
+        desc="Dropping Long Sequences",
     )
     if eval_dataset:
         eval_dataset = eval_dataset.filter(
             drop_long,
             num_proc=cfg.dataset_processes,
             load_from_cache_file=not cfg.is_preprocess,
+            desc="Dropping Long Sequences",
         )
 
     if cfg.group_by_length:
@@ -141,6 +143,7 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
         add_length,
         num_proc=cfg.dataset_processes,
         load_from_cache_file=not cfg.is_preprocess,
+        desc="Packing (Group By Length)",
     )
 
     if cfg.sample_packing:
@@ -148,6 +151,7 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
         add_position_ids,
         num_proc=cfg.dataset_processes,
         load_from_cache_file=not cfg.is_preprocess,
+        desc="Packing (Sample Packing)",
     )
     if cfg.eval_sample_packing is not False:
         if eval_dataset:
@@ -155,6 +159,7 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
                 add_position_ids,
                 num_proc=cfg.dataset_processes,
                 load_from_cache_file=not cfg.is_preprocess,
+                desc="Packing (Sample Packing)",
             )
 
     return train_dataset, eval_dataset
@@ -163,9 +168,13 @@ def process_pretraining_datasets_for_packing(train_dataset, sequence_len):
 
     drop_long = partial(drop_long_seq, sequence_len=sequence_len)
 
-    train_dataset = train_dataset.filter(drop_long)
+    train_dataset = train_dataset.filter(
+        drop_long,
+        desc="Dropping Long Sequences",
+    )
     train_dataset = train_dataset.map(
         add_position_ids,
+        desc="Packing Pretraining Dataset",
     )
 
     return train_dataset

From efe1fb2dc136182425b625446264707e67c349d8 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Mon, 22 Jan 2024 20:39:18 -0500
Subject: [PATCH 2/2] update descriptions

---
 src/axolotl/utils/trainer.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py
index b8b9f55c0e..24ea64532e 100644
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -143,7 +143,7 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
         add_length,
         num_proc=cfg.dataset_processes,
         load_from_cache_file=not cfg.is_preprocess,
-        desc="Packing (Group By Length)",
+        desc="Group By Length",
     )
 
     if cfg.sample_packing:
@@ -151,7 +151,7 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
         add_position_ids,
         num_proc=cfg.dataset_processes,
         load_from_cache_file=not cfg.is_preprocess,
-        desc="Packing (Sample Packing)",
+        desc="Add position_id column (Sample Packing)",
     )
     if cfg.eval_sample_packing is not False:
         if eval_dataset:
@@ -159,7 +159,7 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
                 add_position_ids,
                 num_proc=cfg.dataset_processes,
                 load_from_cache_file=not cfg.is_preprocess,
-                desc="Packing (Sample Packing)",
+                desc="Add position_id column (Sample Packing)",
             )
 
     return train_dataset, eval_dataset
@@ -174,7 +174,7 @@ def process_pretraining_datasets_for_packing(train_dataset, sequence_len):
     )
     train_dataset = train_dataset.map(
         add_position_ids,
-        desc="Packing Pretraining Dataset",
+        desc="Add position_id column (Pretraining Sample Packing)",
    )
 
     return train_dataset