Skip to content

Commit

Permalink
Fix setting correct repo id when pushing dataset to hub (#1657)
Browse files (browse the repository at this point in the history)
* use the ds hash as the dataset's config_name

* improve logging for loading/pushing ds to hub

* fix missing f string
  • Loading branch information
chrislee973 authored Aug 5, 2024
1 parent 203816f commit 7402eb9
Showing 1 changed file with 11 additions and 3 deletions.
14 changes: 11 additions & 3 deletions src/axolotl/utils/data/sft.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -160,8 +160,12 @@ def load_tokenized_prepared_datasets(
use_auth_token = cfg.hf_use_auth_token
try:
if cfg.push_dataset_to_hub:
+ LOG.info(
+ f"Attempting to load prepared dataset from Huggingface hub at {cfg.push_dataset_to_hub} (version {ds_hash})..."
+ )
dataset = load_dataset(
- f"{cfg.push_dataset_to_hub}/{ds_hash}",
+ cfg.push_dataset_to_hub,
+ ds_hash,
token=use_auth_token,
)
dataset = dataset[split]
Expand All @@ -181,6 +185,8 @@ def load_tokenized_prepared_datasets(
dataset = load_from_disk(str(prepared_ds_path))
LOG.info("Prepared dataset loaded from disk...")
else:
+ if cfg.push_dataset_to_hub:
+ LOG.info("Unable to find prepared dataset in Huggingface hub")
LOG.info(f"Unable to find prepared dataset in {prepared_ds_path}")
LOG.info("Loading raw datasets...")
if not cfg.is_preprocess:
Expand Down Expand Up @@ -433,10 +439,12 @@ def for_d_in_datasets(dataset_configs):
dataset.save_to_disk(str(prepared_ds_path))
if cfg.push_dataset_to_hub:
LOG.info(
- f"Saving merged prepared dataset with push_to_hub... {cfg.push_dataset_to_hub}/{ds_hash}"
+ f"Pushing merged prepared dataset to Huggingface hub at {cfg.push_dataset_to_hub} (version {ds_hash})..."
)
dataset.push_to_hub(
- f"{cfg.push_dataset_to_hub}/{ds_hash}", private=True
+ cfg.push_dataset_to_hub,
+ ds_hash,
+ private=True,
)

return dataset, prompters
Expand Down

0 comments on commit 7402eb9

Please sign in to comment.