From 54fca299f58ca51ca0febd21440297aaed751193 Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Tue, 5 Dec 2023 01:16:18 +0900 Subject: [PATCH] fix(tokenizer): handle fast tokenizer properly for bos/eos --- src/axolotl/utils/models.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index acc6f41fa6..ebe77c8a88 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -69,6 +69,7 @@ def load_tokenizer(cfg): "LlamaTokenizer", "LlamaTokenizerFast", "CodeLlamaTokenizer", + "CodeLlamaTokenizerFast", ] and hasattr(tokenizer, "pad_token") and not tokenizer.pad_token @@ -101,6 +102,23 @@ def load_tokenizer(cfg): tokenizer.add_special_tokens( {k: AddedToken(val, rstrip=False, lstrip=False, normalized=False)} ) + + # If we add bos_token and eos_token, we need to update the post processor to + # handle them correctly. + # https://github.com/huggingface/transformers/pull/24132 + bos_and_eos_in_special_tokens = ( + "bos_token" in cfg.special_tokens and "eos_token" in cfg.special_tokens + ) + if ( + tokenizer.__class__.__name__ + in ( + "LlamaTokenizerFast", + "CodeLlamaTokenizerFast", + ) + and bos_and_eos_in_special_tokens + ): + tokenizer.update_post_processor() + if cfg.tokens: tokenizer.add_tokens( [