From 48630f5b34c1de005590f481074bcbbedbd9a774 Mon Sep 17 00:00:00 2001
From: MilesQLi
Date: Wed, 15 Nov 2023 11:12:32 -0800
Subject: [PATCH] Update data.py for signature generation (#851)

* Update data.py

Change of conversation formatting type should also trigger updating the preprocessed dataset, so it should be part of the signature.

* chore: lint

---------

Co-authored-by: Wing Lian
---
 src/axolotl/utils/data.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py
index 52002a7c35..2af85831ad 100644
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -99,7 +99,12 @@ def load_tokenized_prepared_datasets(
                 str(cfg.sequence_len)
                 + "@"
                 + "|".join(
-                    sorted([f"{d.path}:{d.type}:{d.shards}" for d in cfg.datasets])
+                    sorted(
+                        [
+                            f"{d.path}:{d.type}:{d.shards}:{d.conversation}"
+                            for d in cfg.datasets
+                        ]
+                    )
                 )
                 + "|"
                 + tokenizer_name