From 00dce35fb224d38b9e45c333c6c7c7ca5ee346ee Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Mon, 18 Sep 2023 00:32:27 +0900 Subject: [PATCH] Feat(data): Allow loading local csv and text (#594) * Feat(data): Allow loading local csv and text * chore: update readme for loading data --- README.md | 8 ++++---- src/axolotl/utils/data.py | 4 ++++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index dc9953b76d..c9b935f53d 100644 --- a/README.md +++ b/README.md @@ -434,10 +434,10 @@ datasets: - path: vicgalle/alpaca-gpt4 # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection] type: alpaca # format | format: (chat/instruct) | .load_ - ds_type: # Optional[str] (json|arrow|parquet) defines the datatype when path is a file - data_files: # path to source data files - shards: # number of shards to split data into - name: # name of dataset configuration to load + ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file + data_files: # Optional[str] path to source data files + shards: # Optional[int] number of shards to split data into + name: # Optional[str] name of dataset configuration to load # custom user prompt - path: repo diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py index 66d207374e..2dc2d82b28 100644 --- a/src/axolotl/utils/data.py +++ b/src/axolotl/utils/data.py @@ -183,6 +183,10 @@ def for_d_in_datasets(dataset_configs): ds_type = "parquet" elif ".arrow" in d.path: ds_type = "arrow" + elif ".csv" in d.path: + ds_type = "csv" + elif ".txt" in d.path: + ds_type = "text" ds = load_dataset( ds_type, name=d.name,