From f2d48d2e0fc4ede66ea8a63301277d3db3b92766 Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Thu, 18 Jan 2024 01:40:40 +0900 Subject: [PATCH] fix(preprocess): Make sure dataset is not loaded from cache when using the preprocess CLI --- src/axolotl/utils/data.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py index d65e19ab4f..15ae8d5a56 100644 --- a/src/axolotl/utils/data.py +++ b/src/axolotl/utils/data.py @@ -152,7 +152,11 @@ def load_tokenized_prepared_datasets( if dataset: ... - elif cfg.dataset_prepared_path and any(prepared_ds_path.glob("*")): + elif ( + cfg.dataset_prepared_path + and any(prepared_ds_path.glob("*")) + and not cfg.is_preprocess + ): LOG.info(f"Loading prepared dataset from disk at {prepared_ds_path}...") dataset = load_from_disk(str(prepared_ds_path)) LOG.info("Prepared dataset loaded from disk...") @@ -465,7 +469,11 @@ def load_prepare_datasets( if dataset: ... - elif cfg.dataset_prepared_path and any(prepared_ds_path.glob("*")): + elif ( + cfg.dataset_prepared_path + and any(prepared_ds_path.glob("*")) + and not cfg.is_preprocess + ): LOG.info( f"Loading prepared packed dataset from disk at {prepared_ds_path}..." )