From bfe1425b34990a092737be0be8473d80175e53b6 Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Fri, 8 Dec 2023 11:31:13 +0900 Subject: [PATCH] fix(tokenizer): handle fast tokenizer properly for bos/eos (#914) --- src/axolotl/utils/models.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index 40a0a89474..6c77ea4c67 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -92,6 +92,7 @@ def load_tokenizer(cfg): "LlamaTokenizer", "LlamaTokenizerFast", "CodeLlamaTokenizer", + "CodeLlamaTokenizerFast", ] and hasattr(tokenizer, "pad_token") and not tokenizer.pad_token @@ -124,6 +125,23 @@ def load_tokenizer(cfg): tokenizer.add_special_tokens( {k: AddedToken(val, rstrip=False, lstrip=False, normalized=False)} ) + + # If we add bos_token and eos_token, we need to update the post processor to + # handle them correctly. + # https://github.com/huggingface/transformers/pull/24132 + bos_or_eos_in_special_tokens = ( + "bos_token" in cfg.special_tokens and "eos_token" in cfg.special_tokens + ) + if ( + tokenizer.__class__.__name__ + in ( + "LlamaTokenizerFast", + "CodeLlamaTokenizerFast", + ) + and bos_or_eos_in_special_tokens + ): + tokenizer.update_post_processor() + if cfg.tokens: tokenizer.add_tokens( [