From 799779f83eecc96599b84e95438e5eeb317e8efa Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Fri, 19 Jan 2024 23:40:02 -0500 Subject: [PATCH] raise exception sooner if not pre-processed before training --- src/axolotl/utils/data.py | 4 ++++ src/axolotl/utils/trainer.py | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py index a0fd3ea1a8..484546db5d 100644 --- a/src/axolotl/utils/data.py +++ b/src/axolotl/utils/data.py @@ -160,6 +160,10 @@ def load_tokenized_prepared_datasets( else: LOG.info(f"Unable to find prepared dataset in {prepared_ds_path}") LOG.info("Loading raw datasets...") + if not cfg.is_preprocess: + raise RuntimeWarning( + "Processing datasets during training can lead to VRAM instability. Please pre-process your dataset" + ) if cfg.seed: seed = cfg.seed diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index 6d2f08c8ec..9a9eeab4b9 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -107,10 +107,6 @@ def drop_long_seq(sample, sequence_len=2048): def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer): - if cfg.is_preprocess: - LOG.warning( - "Processing datasets during training can lead to VRAM instability. Please pre-process your dataset" - ) drop_long = partial(drop_long_seq, sequence_len=cfg.sequence_len) with zero_first(is_main_process()): if cfg.group_by_length: