Skip to content

Commit

Permalink
fix(preprocess): Make sure dataset not loaded from cache when using preprocess cli (#1136)
Browse files Browse the repository at this point in the history
  • Loading branch information
NanoCode012 authored Jan 17, 2024
1 parent 7570446 commit 1e56b88
Showing 1 changed file with 10 additions and 2 deletions.
12 changes: 10 additions & 2 deletions src/axolotl/utils/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,11 @@ def load_tokenized_prepared_datasets(

 if dataset:
     ...
-elif cfg.dataset_prepared_path and any(prepared_ds_path.glob("*")):
+elif (
+    cfg.dataset_prepared_path
+    and any(prepared_ds_path.glob("*"))
+    and not cfg.is_preprocess
+):
     LOG.info(f"Loading prepared dataset from disk at {prepared_ds_path}...")
     dataset = load_from_disk(str(prepared_ds_path))
     LOG.info("Prepared dataset loaded from disk...")
Expand Down Expand Up @@ -465,7 +469,11 @@ def load_prepare_datasets(

 if dataset:
     ...
-elif cfg.dataset_prepared_path and any(prepared_ds_path.glob("*")):
+elif (
+    cfg.dataset_prepared_path
+    and any(prepared_ds_path.glob("*"))
+    and not cfg.is_preprocess
+):
     LOG.info(
         f"Loading prepared packed dataset from disk at {prepared_ds_path}..."
     )
Expand Down

0 comments on commit 1e56b88

Please sign in to comment.