From e75de2b6829eb6e5b90732abed896448b754d164 Mon Sep 17 00:00:00 2001
From: Keith Stevens
Date: Fri, 22 Mar 2024 09:11:34 +0000
Subject: [PATCH 1/3] Support loading datasets saved via save_to_disk

---
 src/axolotl/utils/data.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py
index 9e0049e659..870bb9ca60 100644
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -1,4 +1,5 @@
 """Module containing data utilities"""
+
 import functools
 import hashlib
 import logging
@@ -290,14 +291,16 @@ def for_d_in_datasets(dataset_configs):
         local_path = Path(config_dataset.path)
         if local_path.exists():
             if local_path.is_dir():
-                # TODO dirs with arrow or parquet files could be loaded with `load_from_disk`
-                ds = load_dataset(
-                    config_dataset.path,
-                    name=config_dataset.name,
-                    data_files=config_dataset.data_files,
-                    streaming=False,
-                    split=None,
-                )
+                if config_dataset.data_files:
+                    ds = load_dataset(
+                        config_dataset.path,
+                        name=config_dataset.name,
+                        data_files=config_dataset.data_files,
+                        streaming=False,
+                        split=None,
+                    )
+                else:
+                    ds = load_from_disk(config_dataset.path)
             elif local_path.is_file():
                 ds_type = get_ds_type(config_dataset)


From f683159a6769f78f8f275bc3ec74b2dc05c000fa Mon Sep 17 00:00:00 2001
From: Keith Stevens
Date: Mon, 25 Mar 2024 05:43:55 +0000
Subject: [PATCH 2/3] Adding comprehensive unittests

---
 src/axolotl/utils/data.py |   5 +-
 tests/test_datasets.py    | 265 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 268 insertions(+), 2 deletions(-)
 create mode 100644 tests/test_datasets.py

diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py
index 870bb9ca60..e1aed8941e 100644
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -224,7 +224,7 @@ def for_d_in_datasets(dataset_configs):
                 token=use_auth_token,
             )
             ds_from_hub = True
-        except (FileNotFoundError, ConnectionError, HFValidationError):
+        except (FileNotFoundError, ConnectionError, HFValidationError, ValueError):
             pass

         ds_from_cloud = False
@@ -292,8 +292,9 @@ def for_d_in_datasets(dataset_configs):
         if local_path.exists():
             if local_path.is_dir():
                 if config_dataset.data_files:
+                    ds_type = get_ds_type(config_dataset)
                     ds = load_dataset(
-                        config_dataset.path,
+                        ds_type,
                         name=config_dataset.name,
                         data_files=config_dataset.data_files,
                         streaming=False,
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
new file mode 100644
index 0000000000..54e663ebe3
--- /dev/null
+++ b/tests/test_datasets.py
@@ -0,0 +1,265 @@
+"""
+Test dataset loading under various conditions.
+"""
+
+import shutil
+import tempfile
+import unittest
+from pathlib import Path
+
+from datasets import Dataset
+from huggingface_hub import snapshot_download
+from transformers import AutoTokenizer
+
+from axolotl.utils.data import load_tokenized_prepared_datasets
+from axolotl.utils.dict import DictDefault
+
+
+class TestDatasetPreparation(unittest.TestCase):
+    """Test a configured dataloader."""
+
+    def setUp(self) -> None:
+        self.tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
+        self.tokenizer.add_special_tokens(
+            {
+                "bos_token": "<s>",
+                "eos_token": "</s>",
+                "unk_token": "<unk>",
+            }
+        )
+        # Alpaca dataset.
+        self.dataset = Dataset.from_list(
+            [
+                {
+                    "instruction": "Evaluate this sentence for spelling and grammar mistakes",
+                    "input": "He finnished his meal and left the resturant",
+                    "output": "He finished his meal and left the restaurant.",
+                }
+            ]
+        )
+
+    def test_load_hub(self):
+        """Core use case. Verify that processing data from the hub works"""
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            prepared_path = Path(tmp_dir) / "prepared"
+            cfg = DictDefault(
+                {
+                    "sequence_len": 1024,
+                    "datasets": [
+                        {
+                            "path": "mhenrichsen/alpaca_2k_test",
+                            "type": "alpaca",
+                        },
+                    ],
+                }
+            )
+
+            dataset, _ = load_tokenized_prepared_datasets(
+                self.tokenizer, cfg, prepared_path
+            )
+
+            assert len(dataset) == 2000
+            assert "input_ids" in dataset.features
+            assert "attention_mask" in dataset.features
+            assert "labels" in dataset.features
+
+    def test_load_local_hub(self):
+        """Niche use case. Verify that a local copy of a hub dataset can be loaded"""
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            tmp_ds_path = Path("mhenrichsen/alpaca_2k_test")
+            tmp_ds_path.mkdir(parents=True, exist_ok=True)
+            snapshot_download(
+                repo_id="mhenrichsen/alpaca_2k_test",
+                repo_type="dataset",
+                local_dir=tmp_ds_path,
+            )
+
+            prepared_path = Path(tmp_dir) / "prepared"
+            # Right now a local copy that doesn't fully conform to a dataset
+            # must list data_files and ds_type otherwise the loader won't know
+            # how to load it.
+            cfg = DictDefault(
+                {
+                    "sequence_len": 1024,
+                    "datasets": [
+                        {
+                            "path": "mhenrichsen/alpaca_2k_test",
+                            "ds_type": "parquet",
+                            "type": "alpaca",
+                            "data_files": [
+                                "mhenrichsen/alpaca_2k_test/alpaca_2000.parquet",
+                            ],
+                        },
+                    ],
+                }
+            )
+
+            dataset, _ = load_tokenized_prepared_datasets(
+                self.tokenizer, cfg, prepared_path
+            )
+
+            assert len(dataset) == 2000
+            assert "input_ids" in dataset.features
+            assert "attention_mask" in dataset.features
+            assert "labels" in dataset.features
+            shutil.rmtree(tmp_ds_path)
+
+    def test_load_from_save_to_disk(self):
+        """Usual use case. Verify datasets saved via `save_to_disk` can be loaded."""
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            tmp_ds_name = Path(tmp_dir) / "tmp_dataset"
+            self.dataset.save_to_disk(tmp_ds_name)
+
+            prepared_path = Path(tmp_dir) / "prepared"
+            cfg = DictDefault(
+                {
+                    "sequence_len": 256,
+                    "datasets": [
+                        {
+                            "path": str(tmp_ds_name),
+                            "type": "alpaca",
+                        },
+                    ],
+                }
+            )
+
+            dataset, _ = load_tokenized_prepared_datasets(
+                self.tokenizer, cfg, prepared_path
+            )
+
+            assert len(dataset) == 1
+            assert "input_ids" in dataset.features
+            assert "attention_mask" in dataset.features
+            assert "labels" in dataset.features
+
+    def test_load_from_dir_of_parquet(self):
+        """Usual use case. Verify a directory of parquet files can be loaded."""
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            tmp_ds_dir = Path(tmp_dir) / "tmp_dataset"
+            tmp_ds_dir.mkdir()
+            tmp_ds_path = tmp_ds_dir / "shard1.parquet"
+            self.dataset.to_parquet(tmp_ds_path)
+
+            prepared_path: Path = Path(tmp_dir) / "prepared"
+            cfg = DictDefault(
+                {
+                    "sequence_len": 256,
+                    "datasets": [
+                        {
+                            "path": str(tmp_ds_dir),
+                            "ds_type": "parquet",
+                            "name": "test_data",
+                            "data_files": [
+                                str(tmp_ds_path),
+                            ],
+                            "type": "alpaca",
+                        },
+                    ],
+                }
+            )
+
+            dataset, _ = load_tokenized_prepared_datasets(
+                self.tokenizer, cfg, prepared_path
+            )
+
+            assert len(dataset) == 1
+            assert "input_ids" in dataset.features
+            assert "attention_mask" in dataset.features
+            assert "labels" in dataset.features
+
+    def test_load_from_dir_of_json(self):
+        """Standard use case. Verify a directory of json files can be loaded."""
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            tmp_ds_dir = Path(tmp_dir) / "tmp_dataset"
+            tmp_ds_dir.mkdir()
+            tmp_ds_path = tmp_ds_dir / "shard1.json"
+            self.dataset.to_json(tmp_ds_path)
+
+            prepared_path: Path = Path(tmp_dir) / "prepared"
+            cfg = DictDefault(
+                {
+                    "sequence_len": 256,
+                    "datasets": [
+                        {
+                            "path": str(tmp_ds_dir),
+                            "ds_type": "json",
+                            "name": "test_data",
+                            "data_files": [
+                                str(tmp_ds_path),
+                            ],
+                            "type": "alpaca",
+                        },
+                    ],
+                }
+            )
+
+            dataset, _ = load_tokenized_prepared_datasets(
+                self.tokenizer, cfg, prepared_path
+            )
+
+            assert len(dataset) == 1
+            assert "input_ids" in dataset.features
+            assert "attention_mask" in dataset.features
+            assert "labels" in dataset.features
+
+    def test_load_from_single_parquet(self):
+        """Standard use case. Verify a single parquet file can be loaded."""
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            tmp_ds_path = Path(tmp_dir) / "tmp_dataset.parquet"
+            self.dataset.to_parquet(tmp_ds_path)
+
+            prepared_path: Path = Path(tmp_dir) / "prepared"
+            cfg = DictDefault(
+                {
+                    "sequence_len": 256,
+                    "datasets": [
+                        {
+                            "path": str(tmp_ds_path),
+                            "name": "test_data",
+                            "type": "alpaca",
+                        },
+                    ],
+                }
+            )
+
+            dataset, _ = load_tokenized_prepared_datasets(
+                self.tokenizer, cfg, prepared_path
+            )
+
+            assert len(dataset) == 1
+            assert "input_ids" in dataset.features
+            assert "attention_mask" in dataset.features
+            assert "labels" in dataset.features
+
+    def test_load_from_single_json(self):
+        """Standard use case. Verify a single json file can be loaded."""
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            tmp_ds_path = Path(tmp_dir) / "tmp_dataset.json"
+            self.dataset.to_json(tmp_ds_path)
+
+            prepared_path: Path = Path(tmp_dir) / "prepared"
+            cfg = DictDefault(
+                {
+                    "sequence_len": 256,
+                    "datasets": [
+                        {
+                            "path": str(tmp_ds_path),
+                            "name": "test_data",
+                            "type": "alpaca",
+                        },
+                    ],
+                }
+            )
+
+            dataset, _ = load_tokenized_prepared_datasets(
+                self.tokenizer, cfg, prepared_path
+            )
+
+            assert len(dataset) == 1
+            assert "input_ids" in dataset.features
+            assert "attention_mask" in dataset.features
+            assert "labels" in dataset.features
+
+
+if __name__ == "__main__":
+    unittest.main()

From 4c80bf1f7b42ae3f37c9c75c7927b11f3d754950 Mon Sep 17 00:00:00 2001
From: Keith Stevens
Date: Wed, 27 Mar 2024 04:54:32 +0000
Subject: [PATCH 3/3] Fix dataset tests due to new hash changes

---
 tests/test_datasets.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index 54e663ebe3..8b7b3dae6a 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -44,6 +44,7 @@ def test_load_hub(self):
             prepared_path = Path(tmp_dir) / "prepared"
             cfg = DictDefault(
                 {
+                    "tokenizer_config": "huggyllama/llama-7b",
                     "sequence_len": 1024,
                     "datasets": [
                         {
@@ -80,6 +81,7 @@ def test_load_local_hub(self):
             # how to load it.
             cfg = DictDefault(
                 {
+                    "tokenizer_config": "huggyllama/llama-7b",
                     "sequence_len": 1024,
                     "datasets": [
                         {
@@ -113,6 +115,7 @@ def test_load_from_save_to_disk(self):
             prepared_path = Path(tmp_dir) / "prepared"
             cfg = DictDefault(
                 {
+                    "tokenizer_config": "huggyllama/llama-7b",
                     "sequence_len": 256,
                     "datasets": [
                         {
@@ -143,6 +146,7 @@ def test_load_from_dir_of_parquet(self):
             prepared_path: Path = Path(tmp_dir) / "prepared"
             cfg = DictDefault(
                 {
+                    "tokenizer_config": "huggyllama/llama-7b",
                     "sequence_len": 256,
                     "datasets": [
                         {
@@ -178,6 +182,7 @@ def test_load_from_dir_of_json(self):
             prepared_path: Path = Path(tmp_dir) / "prepared"
             cfg = DictDefault(
                 {
+                    "tokenizer_config": "huggyllama/llama-7b",
                     "sequence_len": 256,
                     "datasets": [
                         {
@@ -211,6 +216,7 @@ def test_load_from_single_parquet(self):
             prepared_path: Path = Path(tmp_dir) / "prepared"
             cfg = DictDefault(
                 {
+                    "tokenizer_config": "huggyllama/llama-7b",
                     "sequence_len": 256,
                     "datasets": [
                         {
@@ -240,6 +246,7 @@ def test_load_from_single_json(self):
             prepared_path: Path = Path(tmp_dir) / "prepared"
             cfg = DictDefault(
                 {
+                    "tokenizer_config": "huggyllama/llama-7b",
                     "sequence_len": 256,
                     "datasets": [
                         {