From 53b0bf727aa08f40cbd85392c1be8663f4d7f8ba Mon Sep 17 00:00:00 2001 From: Seungduk Kim Date: Tue, 12 Mar 2024 02:08:54 +0900 Subject: [PATCH 1/6] Add a config not to shuffle merged dataset --- .../utils/config/models/input/v0_4_1/__init__.py | 1 + src/axolotl/utils/data.py | 13 ++++++++++--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/axolotl/utils/config/models/input/v0_4_1/__init__.py b/src/axolotl/utils/config/models/input/v0_4_1/__init__.py index 79fffe9cd6..34e955f00e 100644 --- a/src/axolotl/utils/config/models/input/v0_4_1/__init__.py +++ b/src/axolotl/utils/config/models/input/v0_4_1/__init__.py @@ -414,6 +414,7 @@ class Config: dataset_prepared_path: Optional[str] = None dataset_shard_num: Optional[int] = None dataset_shard_idx: Optional[int] = None + not_shuffle_merged_datasets: Optional[bool] = None pretraining_dataset: Optional[ # type: ignore conlist(Union[SFTDataset, PretrainingDataset], min_length=1) diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py index ad3a5cb2d8..da41fa6508 100644 --- a/src/axolotl/utils/data.py +++ b/src/axolotl/utils/data.py @@ -415,8 +415,11 @@ def for_d_in_datasets(dataset_configs): dataset = concatenate_datasets(datasets) if len(datasets) > 1: - LOG.info("shuffle merged datasets") - dataset = dataset.shuffle(seed=seed) + if cfg.not_shuffle_merged_datasets: + LOG.info("NOT shuffling merged datasets") + else: + LOG.info("shuffle merged datasets") + dataset = dataset.shuffle(seed=seed) dataset, _ = process_datasets_for_packing(cfg, dataset, None) @@ -819,7 +822,11 @@ def wrap_pretraining_dataset( else: encode = functools.partial(encode_pretraining, tokenizer, max_tokens) - dataset = dataset.shuffle(seed=seed, buffer_size=buffer_size) + if cfg.not_shuffle_merged_datasets: + LOG.info("NOT shuffling merged pretraining datasets") + else: + dataset = dataset.shuffle(seed=seed, buffer_size=buffer_size) + dataset = dataset.map( encode, batched=True, From 10d19d2b62626b36354b449d5d53a9b24cbebcf6 Mon Sep 17 00:00:00 2001 From: Seungduk Kim Date: Tue, 12 Mar 2024 02:16:32 +0900 Subject: [PATCH 2/6] Update README.md --- README.md | 4 ++++ src/axolotl/utils/config/models/input/v0_4_1/__init__.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 590a14faf3..2f7444c71f 100644 --- a/README.md +++ b/README.md @@ -662,6 +662,10 @@ datasets: # For `completion` datsets only, uses the provided field instead of `text` column field: +# If true, the datasets will not be shuffled and will keep their original order in `datasets`. +# The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is false. +not_shuffle_merged_datasets: false + # A list of one or more datasets to eval the model with. # You can use either test_datasets, or val_set_size, but not both. test_datasets: diff --git a/src/axolotl/utils/config/models/input/v0_4_1/__init__.py b/src/axolotl/utils/config/models/input/v0_4_1/__init__.py index 34e955f00e..4866958af7 100644 --- a/src/axolotl/utils/config/models/input/v0_4_1/__init__.py +++ b/src/axolotl/utils/config/models/input/v0_4_1/__init__.py @@ -411,10 +411,10 @@ class Config: datasets: Optional[conlist(Union[SFTDataset, DPODataset], min_length=1)] = None # type: ignore test_datasets: Optional[conlist(Union[SFTDataset, DPODataset], min_length=1)] = None # type: ignore + not_shuffle_merged_datasets: Optional[bool] = None dataset_prepared_path: Optional[str] = None dataset_shard_num: Optional[int] = None dataset_shard_idx: Optional[int] = None - not_shuffle_merged_datasets: Optional[bool] = None pretraining_dataset: Optional[ # type: ignore conlist(Union[SFTDataset, PretrainingDataset], min_length=1) From 4e05beb258e969db38e73e9df71c12f21e43af9e Mon Sep 17 00:00:00 2001 From: Seungduk Kim Date: Wed, 13 Mar 2024 11:15:20 +0900 Subject: [PATCH 3/6] Update src/axolotl/utils/config/models/input/v0_4_1/__init__.py Co-authored-by: Wing Lian --- src/axolotl/utils/config/models/input/v0_4_1/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/axolotl/utils/config/models/input/v0_4_1/__init__.py b/src/axolotl/utils/config/models/input/v0_4_1/__init__.py index 4866958af7..a536fa9ecc 100644 --- a/src/axolotl/utils/config/models/input/v0_4_1/__init__.py +++ b/src/axolotl/utils/config/models/input/v0_4_1/__init__.py @@ -411,7 +411,7 @@ class Config: datasets: Optional[conlist(Union[SFTDataset, DPODataset], min_length=1)] = None # type: ignore test_datasets: Optional[conlist(Union[SFTDataset, DPODataset], min_length=1)] = None # type: ignore - not_shuffle_merged_datasets: Optional[bool] = None + shuffle_merged_datasets: Optional[bool] = True dataset_prepared_path: Optional[str] = None dataset_shard_num: Optional[int] = None dataset_shard_idx: Optional[int] = None From 59ccd2160024bbc379ddee2c7d06249daed0bf64 Mon Sep 17 00:00:00 2001 From: Seungduk Kim Date: Wed, 13 Mar 2024 02:19:32 +0000 Subject: [PATCH 4/6] invert the condition name --- src/axolotl/utils/data.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py index da41fa6508..b7d0b00009 100644 --- a/src/axolotl/utils/data.py +++ b/src/axolotl/utils/data.py @@ -415,11 +415,11 @@ def for_d_in_datasets(dataset_configs): dataset = concatenate_datasets(datasets) if len(datasets) > 1: - if cfg.not_shuffle_merged_datasets: - LOG.info("NOT shuffling merged datasets") - else: - LOG.info("shuffle merged datasets") + if cfg.shuffle_merged_datasets: + LOG.debug("shuffle merged datasets") dataset = dataset.shuffle(seed=seed) + else: + LOG.debug("NOT shuffling merged datasets") dataset, _ = process_datasets_for_packing(cfg, dataset, None) @@ -822,10 +822,10 @@ def wrap_pretraining_dataset( else: encode = functools.partial(encode_pretraining, tokenizer, max_tokens) - if cfg.not_shuffle_merged_datasets: - LOG.info("NOT shuffling merged pretraining datasets") - else: + if cfg.shuffle_merged_datasets: dataset = dataset.shuffle(seed=seed, buffer_size=buffer_size) + else: + LOG.info("NOT shuffling merged pretraining datasets") dataset = dataset.map( encode, From 75579fb619b8a6c199f9b5af513a81004349ce65 Mon Sep 17 00:00:00 2001 From: Seungduk Kim Date: Wed, 13 Mar 2024 02:20:40 +0000 Subject: [PATCH 5/6] update README --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 2f7444c71f..99432fa32b 100644 --- a/README.md +++ b/README.md @@ -662,9 +662,9 @@ datasets: # For `completion` datsets only, uses the provided field instead of `text` column field: -# If true, the datasets will not be shuffled and will keep their original order in `datasets`. -# The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is false. -not_shuffle_merged_datasets: false +# If false, the datasets will not be shuffled and will keep their original order in `datasets`. +# The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true. +shuffle_merged_datasets: true # A list of one or more datasets to eval the model with. # You can use either test_datasets, or val_set_size, but not both. From abfc7dd575a9d288543c9abeb24a3f28ab1e8142 Mon Sep 17 00:00:00 2001 From: Seungduk Kim Date: Wed, 13 Mar 2024 02:22:22 +0000 Subject: [PATCH 6/6] info -> debug --- src/axolotl/utils/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py index b7d0b00009..9e0049e659 100644 --- a/src/axolotl/utils/data.py +++ b/src/axolotl/utils/data.py @@ -825,7 +825,7 @@ def wrap_pretraining_dataset( if cfg.shuffle_merged_datasets: dataset = dataset.shuffle(seed=seed, buffer_size=buffer_size) else: - LOG.info("NOT shuffling merged pretraining datasets") + LOG.debug("NOT shuffling merged pretraining datasets") dataset = dataset.map( encode,