From 00dce35fb224d38b9e45c333c6c7c7ca5ee346ee Mon Sep 17 00:00:00 2001
From: NanoCode012 <kevinvong@rocketmail.com>
Date: Mon, 18 Sep 2023 00:32:27 +0900
Subject: [PATCH] Feat(data): Allow loading local csv and text (#594)

* Feat(data): Allow loading local csv and text

* chore: update readme for loading data
---
 README.md                 | 8 ++++----
 src/axolotl/utils/data.py | 4 ++++
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index dc9953b76d..c9b935f53d 100644
--- a/README.md
+++ b/README.md
@@ -434,10 +434,10 @@ datasets:
   - path: vicgalle/alpaca-gpt4
   # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
     type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
-    ds_type: # Optional[str] (json|arrow|parquet) defines the datatype when path is a file
-    data_files: # path to source data files
-    shards: # number of shards to split data into
-    name: # name of dataset configuration to load
+    ds_type: # Optional[str] (json|arrow|parquet|text|csv) file format to load when path is a file
+    data_files: # Optional[str] path to source data files
+    shards: # Optional[int] number of shards to split data into
+    name: # Optional[str] name of dataset configuration to load
 
   # custom user prompt
   - path: repo
diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py
index 66d207374e..2dc2d82b28 100644
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -183,6 +183,10 @@ def for_d_in_datasets(dataset_configs):
                         ds_type = "parquet"
                     elif ".arrow" in d.path:
                         ds_type = "arrow"
+                    elif ".csv" in d.path:
+                        ds_type = "csv"
+                    elif ".txt" in d.path:
+                        ds_type = "text"
                     ds = load_dataset(
                         ds_type,
                         name=d.name,