From 91cf4ee72c6313c7dd95ff84d80b71ad30f0f40b Mon Sep 17 00:00:00 2001 From: Hamel Husain Date: Thu, 8 Feb 2024 15:02:35 -0800 Subject: [PATCH] allow remote data paths (#1278) * allow remote data paths * add docs about public url * only allow https * better docs * better docs --- README.md | 8 ++++++++ src/axolotl/utils/data.py | 10 ++++++++++ 2 files changed, 18 insertions(+) diff --git a/README.md b/README.md index 8182ad1e58..0bbba4fc6e 100644 --- a/README.md +++ b/README.md @@ -468,6 +468,14 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod dataset: - path: s3://path_to_ds # Accepts folder with arrow/parquet or file path like above. Supports s3, gcs. ... + + # Loading Data From a Public URL + # - URLs must use the HTTPS protocol, not HTTP, for security reasons. + # - The URL should be a direct link to the file you wish to load. + # - The file format is `json` (which includes `jsonl`) by default. For different formats, adjust the `ds_type` option accordingly. + dataset: + - path: https://some.url.com/yourdata.jsonl # Must be a direct HTTPS link to a single data file (not a folder). + ds_type: json # this is the default, see other options below. ``` - loading diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py index cf09350c19..66a9b0a71b 100644 --- a/src/axolotl/utils/data.py +++ b/src/axolotl/utils/data.py @@ -336,6 +336,16 @@ def for_d_in_datasets(dataset_configs): split=None, storage_options=storage_options, ) + elif config_dataset.path.startswith("https://"): + ds_type = get_ds_type(config_dataset) + ds = load_dataset( + ds_type, + name=config_dataset.name, + data_files=config_dataset.path, + streaming=False, + split=None, + storage_options=storage_options, + ) else: if isinstance(config_dataset.data_files, str): fp = hf_hub_download(