From e75de2b6829eb6e5b90732abed896448b754d164 Mon Sep 17 00:00:00 2001
From: Keith Stevens
Date: Fri, 22 Mar 2024 09:11:34 +0000
Subject: [PATCH 1/3] Support loading datasets saved via save_to_disk

---
 src/axolotl/utils/data.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py
index 9e0049e659..870bb9ca60 100644
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -1,4 +1,5 @@
 """Module containing data utilities"""
+
 import functools
 import hashlib
 import logging
@@ -290,14 +291,16 @@ def for_d_in_datasets(dataset_configs):
         local_path = Path(config_dataset.path)
         if local_path.exists():
             if local_path.is_dir():
-                # TODO dirs with arrow or parquet files could be loaded with `load_from_disk`
-                ds = load_dataset(
-                    config_dataset.path,
-                    name=config_dataset.name,
-                    data_files=config_dataset.data_files,
-                    streaming=False,
-                    split=None,
-                )
+                if config_dataset.data_files:
+                    ds = load_dataset(
+                        config_dataset.path,
+                        name=config_dataset.name,
+                        data_files=config_dataset.data_files,
+                        streaming=False,
+                        split=None,
+                    )
+                else:
+                    ds = load_from_disk(config_dataset.path)
             elif local_path.is_file():
                 ds_type = get_ds_type(config_dataset)


From f683159a6769f78f8f275bc3ec74b2dc05c000fa Mon Sep 17 00:00:00 2001
From: Keith Stevens
Date: Mon, 25 Mar 2024 05:43:55 +0000
Subject: [PATCH 2/3] Adding comprehensive unittests

---
 src/axolotl/utils/data.py |   5 +-
 tests/test_datasets.py    | 265 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 268 insertions(+), 2 deletions(-)
 create mode 100644 tests/test_datasets.py

diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py
index 870bb9ca60..e1aed8941e 100644
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -224,7 +224,7 @@ def for_d_in_datasets(dataset_configs):
                 token=use_auth_token,
             )
             ds_from_hub = True
-        except (FileNotFoundError, ConnectionError, HFValidationError):
+        except (FileNotFoundError, ConnectionError, HFValidationError, ValueError):
             pass

         ds_from_cloud = False
@@ -292,8 +292,9 @@ def for_d_in_datasets(dataset_configs):
         if local_path.exists():
             if local_path.is_dir():
                 if config_dataset.data_files:
+                    ds_type = get_ds_type(config_dataset)
                     ds = load_dataset(
-                        config_dataset.path,
+                        ds_type,
                         name=config_dataset.name,
                         data_files=config_dataset.data_files,
                         streaming=False,
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
new file mode 100644
index 0000000000..54e663ebe3
--- /dev/null
+++ b/tests/test_datasets.py
@@ -0,0 +1,265 @@
+"""
+Test dataset loading under various conditions.
+"""
+
+import shutil
+import tempfile
+import unittest
+from pathlib import Path
+
+from datasets import Dataset
+from huggingface_hub import snapshot_download
+from transformers import AutoTokenizer
+
+from axolotl.utils.data import load_tokenized_prepared_datasets
+from axolotl.utils.dict import DictDefault
+
+
+class TestDatasetPreparation(unittest.TestCase):
+    """Test a configured dataloader."""
+
+    def setUp(self) -> None:
+        self.tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
+        self.tokenizer.add_special_tokens(
+            {
+                "bos_token": "<s>",
+                "eos_token": "</s>",
+                "unk_token": "<unk>",
+            }
+        )
+        # Alpaca dataset.
+        self.dataset = Dataset.from_list(
+            [
+                {
+                    "instruction": "Evaluate this sentence for spelling and grammar mistakes",
+                    "input": "He finnished his meal and left the resturant",
+                    "output": "He finished his meal and left the restaurant.",
+                }
+            ]
+        )
+
+    def test_load_hub(self):
+        """Core use case. Verify that processing data from the hub works"""
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            prepared_path = Path(tmp_dir) / "prepared"
+            cfg = DictDefault(
+                {
+                    "sequence_len": 1024,
+                    "datasets": [
+                        {
+                            "path": "mhenrichsen/alpaca_2k_test",
+                            "type": "alpaca",
+                        },
+                    ],
+                }
+            )
+
+            dataset, _ = load_tokenized_prepared_datasets(
+                self.tokenizer, cfg, prepared_path
+            )
+
+            assert len(dataset) == 2000
+            assert "input_ids" in dataset.features
+            assert "attention_mask" in dataset.features
+            assert "labels" in dataset.features
+
+    def test_load_local_hub(self):
+        """Niche use case. Verify that a local copy of a hub dataset can be loaded"""
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            tmp_ds_path = Path("mhenrichsen/alpaca_2k_test")
+            tmp_ds_path.mkdir(parents=True, exist_ok=True)
+            snapshot_download(
+                repo_id="mhenrichsen/alpaca_2k_test",
+                repo_type="dataset",
+                local_dir=tmp_ds_path,
+            )
+
+            prepared_path = Path(tmp_dir) / "prepared"
+            # Right now a local copy that doesn't fully conform to a dataset
+            # must list data_files and ds_type otherwise the loader won't know
+            # how to load it.
+            cfg = DictDefault(
+                {
+                    "sequence_len": 1024,
+                    "datasets": [
+                        {
+                            "path": "mhenrichsen/alpaca_2k_test",
+                            "ds_type": "parquet",
+                            "type": "alpaca",
+                            "data_files": [
+                                "mhenrichsen/alpaca_2k_test/alpaca_2000.parquet",
+                            ],
+                        },
+                    ],
+                }
+            )
+
+            dataset, _ = load_tokenized_prepared_datasets(
+                self.tokenizer, cfg, prepared_path
+            )
+
+            assert len(dataset) == 2000
+            assert "input_ids" in dataset.features
+            assert "attention_mask" in dataset.features
+            assert "labels" in dataset.features
+            shutil.rmtree(tmp_ds_path)
+
+    def test_load_from_save_to_disk(self):
+        """Usual use case. Verify datasets saved via `save_to_disk` can be loaded."""
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            tmp_ds_name = Path(tmp_dir) / "tmp_dataset"
+            self.dataset.save_to_disk(tmp_ds_name)
+
+            prepared_path = Path(tmp_dir) / "prepared"
+            cfg = DictDefault(
+                {
+                    "sequence_len": 256,
+                    "datasets": [
+                        {
+                            "path": str(tmp_ds_name),
+                            "type": "alpaca",
+                        },
+                    ],
+                }
+            )
+
+            dataset, _ = load_tokenized_prepared_datasets(
+                self.tokenizer, cfg, prepared_path
+            )
+
+            assert len(dataset) == 1
+            assert "input_ids" in dataset.features
+            assert "attention_mask" in dataset.features
+            assert "labels" in dataset.features
+
+    def test_load_from_dir_of_parquet(self):
+        """Usual use case. Verify a directory of parquet files can be loaded."""
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            tmp_ds_dir = Path(tmp_dir) / "tmp_dataset"
+            tmp_ds_dir.mkdir()
+            tmp_ds_path = tmp_ds_dir / "shard1.parquet"
+            self.dataset.to_parquet(tmp_ds_path)
+
+            prepared_path: Path = Path(tmp_dir) / "prepared"
+            cfg = DictDefault(
+                {
+                    "sequence_len": 256,
+                    "datasets": [
+                        {
+                            "path": str(tmp_ds_dir),
+                            "ds_type": "parquet",
+                            "name": "test_data",
+                            "data_files": [
+                                str(tmp_ds_path),
+                            ],
+                            "type": "alpaca",
+                        },
+                    ],
+                }
+            )
+
+            dataset, _ = load_tokenized_prepared_datasets(
+                self.tokenizer, cfg, prepared_path
+            )
+
+            assert len(dataset) == 1
+            assert "input_ids" in dataset.features
+            assert "attention_mask" in dataset.features
+            assert "labels" in dataset.features
+
+    def test_load_from_dir_of_json(self):
+        """Standard use case. Verify a directory of json files can be loaded."""
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            tmp_ds_dir = Path(tmp_dir) / "tmp_dataset"
+            tmp_ds_dir.mkdir()
+            tmp_ds_path = tmp_ds_dir / "shard1.json"
+            self.dataset.to_json(tmp_ds_path)
+
+            prepared_path: Path = Path(tmp_dir) / "prepared"
+            cfg = DictDefault(
+                {
+                    "sequence_len": 256,
+                    "datasets": [
+                        {
+                            "path": str(tmp_ds_dir),
+                            "ds_type": "json",
+                            "name": "test_data",
+                            "data_files": [
+                                str(tmp_ds_path),
+                            ],
+                            "type": "alpaca",
+                        },
+                    ],
+                }
+            )
+
+            dataset, _ = load_tokenized_prepared_datasets(
+                self.tokenizer, cfg, prepared_path
+            )
+
+            assert len(dataset) == 1
+            assert "input_ids" in dataset.features
+            assert "attention_mask" in dataset.features
+            assert "labels" in dataset.features
+
+    def test_load_from_single_parquet(self):
+        """Standard use case. Verify a single parquet file can be loaded."""
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            tmp_ds_path = Path(tmp_dir) / "tmp_dataset.parquet"
+            self.dataset.to_parquet(tmp_ds_path)
+
+            prepared_path: Path = Path(tmp_dir) / "prepared"
+            cfg = DictDefault(
+                {
+                    "sequence_len": 256,
+                    "datasets": [
+                        {
+                            "path": str(tmp_ds_path),
+                            "name": "test_data",
+                            "type": "alpaca",
+                        },
+                    ],
+                }
+            )
+
+            dataset, _ = load_tokenized_prepared_datasets(
+                self.tokenizer, cfg, prepared_path
+            )
+
+            assert len(dataset) == 1
+            assert "input_ids" in dataset.features
+            assert "attention_mask" in dataset.features
+            assert "labels" in dataset.features
+
+    def test_load_from_single_json(self):
+        """Standard use case. Verify a single json file can be loaded."""
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            tmp_ds_path = Path(tmp_dir) / "tmp_dataset.json"
+            self.dataset.to_json(tmp_ds_path)
+
+            prepared_path: Path = Path(tmp_dir) / "prepared"
+            cfg = DictDefault(
+                {
+                    "sequence_len": 256,
+                    "datasets": [
+                        {
+                            "path": str(tmp_ds_path),
+                            "name": "test_data",
+                            "type": "alpaca",
+                        },
+                    ],
+                }
+            )
+
+            dataset, _ = load_tokenized_prepared_datasets(
+                self.tokenizer, cfg, prepared_path
+            )
+
+            assert len(dataset) == 1
+            assert "input_ids" in dataset.features
+            assert "attention_mask" in dataset.features
+            assert "labels" in dataset.features
+
+
+if __name__ == "__main__":
+    unittest.main()

From 4c80bf1f7b42ae3f37c9c75c7927b11f3d754950 Mon Sep 17 00:00:00 2001
From: Keith Stevens
Date: Wed, 27 Mar 2024 04:54:32 +0000
Subject: [PATCH 3/3] Fix dataset tests due to new hash changes

---
 tests/test_datasets.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index 54e663ebe3..8b7b3dae6a 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -44,6 +44,7 @@ def test_load_hub(self):
             prepared_path = Path(tmp_dir) / "prepared"
             cfg = DictDefault(
                 {
+                    "tokenizer_config": "huggyllama/llama-7b",
                     "sequence_len": 1024,
                     "datasets": [
                         {
@@ -80,6 +81,7 @@ def test_load_local_hub(self):
             # how to load it.
             cfg = DictDefault(
                 {
+                    "tokenizer_config": "huggyllama/llama-7b",
                     "sequence_len": 1024,
                     "datasets": [
                         {
@@ -113,6 +115,7 @@ def test_load_from_save_to_disk(self):
             prepared_path = Path(tmp_dir) / "prepared"
             cfg = DictDefault(
                 {
+                    "tokenizer_config": "huggyllama/llama-7b",
                     "sequence_len": 256,
                     "datasets": [
                         {
@@ -143,6 +146,7 @@ def test_load_from_dir_of_parquet(self):
             prepared_path: Path = Path(tmp_dir) / "prepared"
             cfg = DictDefault(
                 {
+                    "tokenizer_config": "huggyllama/llama-7b",
                     "sequence_len": 256,
                     "datasets": [
                         {
@@ -178,6 +182,7 @@ def test_load_from_dir_of_json(self):
             prepared_path: Path = Path(tmp_dir) / "prepared"
             cfg = DictDefault(
                 {
+                    "tokenizer_config": "huggyllama/llama-7b",
                     "sequence_len": 256,
                     "datasets": [
                         {
@@ -211,6 +216,7 @@ def test_load_from_single_parquet(self):
             prepared_path: Path = Path(tmp_dir) / "prepared"
             cfg = DictDefault(
                 {
+                    "tokenizer_config": "huggyllama/llama-7b",
                     "sequence_len": 256,
                     "datasets": [
                         {
@@ -240,6 +246,7 @@ def test_load_from_single_json(self):
             prepared_path: Path = Path(tmp_dir) / "prepared"
             cfg = DictDefault(
                 {
+                    "tokenizer_config": "huggyllama/llama-7b",
                     "sequence_len": 256,
                     "datasets": [
                         {