From 5e071b2f33014d86e12f07e2a0b30c8eda0f4853 Mon Sep 17 00:00:00 2001 From: Karl-Johan Alm Date: Wed, 4 Oct 2023 22:41:04 +0900 Subject: [PATCH] pre-commit hook tweaks --- src/axolotl/prompt_strategies/completion.py | 3 ++- src/axolotl/utils/data.py | 14 ++++++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/axolotl/prompt_strategies/completion.py b/src/axolotl/prompt_strategies/completion.py index 51d474c5f9..dff4499bf4 100644 --- a/src/axolotl/prompt_strategies/completion.py +++ b/src/axolotl/prompt_strategies/completion.py @@ -53,7 +53,8 @@ def tokenize_prompt(self, prompt): full_prompt = self._build_full_prompt(instruction, None, None) tokenized_full_prompt = self._tokenize(full_prompt) steps = self.sequence_len - self.overlap_len - if steps < 1: raise ValueError("Sequence length must be greater than overlap length") + if steps < 1: + raise ValueError("Sequence length must be greater than overlap length") for key, val in tokenized_full_prompt.items(): for i in range(0, len(val), steps): diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py index 4f8ce512dc..8df8f565cf 100644 --- a/src/axolotl/utils/data.py +++ b/src/axolotl/utils/data.py @@ -96,7 +96,12 @@ def load_tokenized_prepared_datasets( str(cfg.sequence_len) + "@" + "|".join( - sorted([f"{d.path}:{d.type}:{d.shards}:{d.overlap_len}" for d in cfg.datasets]) + sorted( + [ + f"{d.path}:{d.type}:{d.shards}:{d.overlap_len}" + for d in cfg.datasets + ] + ) ) + "|" + tokenizer_name @@ -395,7 +400,12 @@ def load_prepare_datasets( + str(max_packed_sequence_len) + seed + "|".join( - sorted([f"{d.path}:{d.type}:{d.shards}:{d.overlap_len}" for d in cfg.datasets]) + sorted( + [ + f"{d.path}:{d.type}:{d.shards}:{d.overlap_len}" + for d in cfg.datasets + ] + ) ) + "|" + tokenizer_name