From 53b0bf727aa08f40cbd85392c1be8663f4d7f8ba Mon Sep 17 00:00:00 2001
From: Seungduk Kim <seungduk.kim@yanolja.com>
Date: Tue, 12 Mar 2024 02:08:54 +0900
Subject: [PATCH 1/6] Add a config not to shuffle merged dataset

---
 .../utils/config/models/input/v0_4_1/__init__.py    |  1 +
 src/axolotl/utils/data.py                           | 13 ++++++++++---
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/axolotl/utils/config/models/input/v0_4_1/__init__.py b/src/axolotl/utils/config/models/input/v0_4_1/__init__.py
index 79fffe9cd6..34e955f00e 100644
--- a/src/axolotl/utils/config/models/input/v0_4_1/__init__.py
+++ b/src/axolotl/utils/config/models/input/v0_4_1/__init__.py
@@ -414,6 +414,7 @@ class Config:
     dataset_prepared_path: Optional[str] = None
     dataset_shard_num: Optional[int] = None
     dataset_shard_idx: Optional[int] = None
+    not_shuffle_merged_datasets: Optional[bool] = None
 
     pretraining_dataset: Optional[  # type: ignore
         conlist(Union[SFTDataset, PretrainingDataset], min_length=1)
diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py
index ad3a5cb2d8..da41fa6508 100644
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -415,8 +415,11 @@ def for_d_in_datasets(dataset_configs):
         dataset = concatenate_datasets(datasets)
 
         if len(datasets) > 1:
-            LOG.info("shuffle merged datasets")
-            dataset = dataset.shuffle(seed=seed)
+            if cfg.not_shuffle_merged_datasets:
+                LOG.info("NOT shuffling merged datasets")
+            else:
+                LOG.info("shuffle merged datasets")
+                dataset = dataset.shuffle(seed=seed)
 
         dataset, _ = process_datasets_for_packing(cfg, dataset, None)
 
@@ -819,7 +822,11 @@ def wrap_pretraining_dataset(
     else:
         encode = functools.partial(encode_pretraining, tokenizer, max_tokens)
 
-    dataset = dataset.shuffle(seed=seed, buffer_size=buffer_size)
+    if cfg.not_shuffle_merged_datasets:
+        LOG.info("NOT shuffling merged pretraining datasets")
+    else:
+        dataset = dataset.shuffle(seed=seed, buffer_size=buffer_size)
+
     dataset = dataset.map(
         encode,
         batched=True,

From 10d19d2b62626b36354b449d5d53a9b24cbebcf6 Mon Sep 17 00:00:00 2001
From: Seungduk Kim <seungduk.kim@yanolja.com>
Date: Tue, 12 Mar 2024 02:16:32 +0900
Subject: [PATCH 2/6] Update README.md

---
 README.md                                                | 4 ++++
 src/axolotl/utils/config/models/input/v0_4_1/__init__.py | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 590a14faf3..2f7444c71f 100644
--- a/README.md
+++ b/README.md
@@ -662,6 +662,10 @@ datasets:
       # For `completion` datsets only, uses the provided field instead of `text` column
       field:
 
+# If true, the datasets will not be shuffled and will keep their original order in `datasets`.
+# The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is false.
+not_shuffle_merged_datasets: false
+
 # A list of one or more datasets to eval the model with.
 # You can use either test_datasets, or val_set_size, but not both.
 test_datasets:
diff --git a/src/axolotl/utils/config/models/input/v0_4_1/__init__.py b/src/axolotl/utils/config/models/input/v0_4_1/__init__.py
index 34e955f00e..4866958af7 100644
--- a/src/axolotl/utils/config/models/input/v0_4_1/__init__.py
+++ b/src/axolotl/utils/config/models/input/v0_4_1/__init__.py
@@ -411,10 +411,10 @@ class Config:
 
     datasets: Optional[conlist(Union[SFTDataset, DPODataset], min_length=1)] = None  # type: ignore
     test_datasets: Optional[conlist(Union[SFTDataset, DPODataset], min_length=1)] = None  # type: ignore
+    not_shuffle_merged_datasets: Optional[bool] = None
     dataset_prepared_path: Optional[str] = None
     dataset_shard_num: Optional[int] = None
     dataset_shard_idx: Optional[int] = None
-    not_shuffle_merged_datasets: Optional[bool] = None
 
     pretraining_dataset: Optional[  # type: ignore
         conlist(Union[SFTDataset, PretrainingDataset], min_length=1)

From 4e05beb258e969db38e73e9df71c12f21e43af9e Mon Sep 17 00:00:00 2001
From: Seungduk Kim <seungduk.kim@yanolja.com>
Date: Wed, 13 Mar 2024 11:15:20 +0900
Subject: [PATCH 3/6] Update
 src/axolotl/utils/config/models/input/v0_4_1/__init__.py

Co-authored-by: Wing Lian <wing.lian@gmail.com>
---
 src/axolotl/utils/config/models/input/v0_4_1/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/axolotl/utils/config/models/input/v0_4_1/__init__.py b/src/axolotl/utils/config/models/input/v0_4_1/__init__.py
index 4866958af7..a536fa9ecc 100644
--- a/src/axolotl/utils/config/models/input/v0_4_1/__init__.py
+++ b/src/axolotl/utils/config/models/input/v0_4_1/__init__.py
@@ -411,7 +411,7 @@ class Config:
 
     datasets: Optional[conlist(Union[SFTDataset, DPODataset], min_length=1)] = None  # type: ignore
     test_datasets: Optional[conlist(Union[SFTDataset, DPODataset], min_length=1)] = None  # type: ignore
-    not_shuffle_merged_datasets: Optional[bool] = None
+    shuffle_merged_datasets: Optional[bool] = True
     dataset_prepared_path: Optional[str] = None
     dataset_shard_num: Optional[int] = None
     dataset_shard_idx: Optional[int] = None

From 59ccd2160024bbc379ddee2c7d06249daed0bf64 Mon Sep 17 00:00:00 2001
From: Seungduk Kim <seungduk.kim@yanolja.com>
Date: Wed, 13 Mar 2024 02:19:32 +0000
Subject: [PATCH 4/6] invert the condition name

---
 src/axolotl/utils/data.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py
index da41fa6508..b7d0b00009 100644
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -415,11 +415,11 @@ def for_d_in_datasets(dataset_configs):
         dataset = concatenate_datasets(datasets)
 
         if len(datasets) > 1:
-            if cfg.not_shuffle_merged_datasets:
-                LOG.info("NOT shuffling merged datasets")
-            else:
-                LOG.info("shuffle merged datasets")
+            if cfg.shuffle_merged_datasets:
+                LOG.debug("shuffle merged datasets")
                 dataset = dataset.shuffle(seed=seed)
+            else:
+                LOG.debug("NOT shuffling merged datasets")
 
         dataset, _ = process_datasets_for_packing(cfg, dataset, None)
 
@@ -822,10 +822,10 @@ def wrap_pretraining_dataset(
     else:
         encode = functools.partial(encode_pretraining, tokenizer, max_tokens)
 
-    if cfg.not_shuffle_merged_datasets:
-        LOG.info("NOT shuffling merged pretraining datasets")
-    else:
+    if cfg.shuffle_merged_datasets:
         dataset = dataset.shuffle(seed=seed, buffer_size=buffer_size)
+    else:
+        LOG.info("NOT shuffling merged pretraining datasets")
 
     dataset = dataset.map(
         encode,

From 75579fb619b8a6c199f9b5af513a81004349ce65 Mon Sep 17 00:00:00 2001
From: Seungduk Kim <seungduk.kim@yanolja.com>
Date: Wed, 13 Mar 2024 02:20:40 +0000
Subject: [PATCH 5/6] update README

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 2f7444c71f..99432fa32b 100644
--- a/README.md
+++ b/README.md
@@ -662,9 +662,9 @@ datasets:
       # For `completion` datsets only, uses the provided field instead of `text` column
       field:
 
-# If true, the datasets will not be shuffled and will keep their original order in `datasets`.
-# The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is false.
-not_shuffle_merged_datasets: false
+# If false, the merged datasets are not shuffled after concatenation and keep the order in which they are listed under `datasets`.
+# The same applies to the `test_datasets` and `pretraining_dataset` options. Defaults to true.
+shuffle_merged_datasets: true
 
 # A list of one or more datasets to eval the model with.
 # You can use either test_datasets, or val_set_size, but not both.

From abfc7dd575a9d288543c9abeb24a3f28ab1e8142 Mon Sep 17 00:00:00 2001
From: Seungduk Kim <seungduk.kim@yanolja.com>
Date: Wed, 13 Mar 2024 02:22:22 +0000
Subject: [PATCH 6/6] info -> debug

---
 src/axolotl/utils/data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py
index b7d0b00009..9e0049e659 100644
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -825,7 +825,7 @@ def wrap_pretraining_dataset(
     if cfg.shuffle_merged_datasets:
         dataset = dataset.shuffle(seed=seed, buffer_size=buffer_size)
     else:
-        LOG.info("NOT shuffling merged pretraining datasets")
+        LOG.debug("NOT shuffling merged pretraining datasets")
 
     dataset = dataset.map(
         encode,