From b033b9a4aaee799642de97a6bfbe0da1cfc6918f Mon Sep 17 00:00:00 2001 From: Sam Havens Date: Mon, 23 Oct 2023 17:38:23 -0700 Subject: [PATCH] add chatml tokens to gpt4 tokenizer --- llmfoundry/tokenizers/tiktoken.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/llmfoundry/tokenizers/tiktoken.py b/llmfoundry/tokenizers/tiktoken.py index 41518a582a..d898ca2ec7 100644 --- a/llmfoundry/tokenizers/tiktoken.py +++ b/llmfoundry/tokenizers/tiktoken.py @@ -57,7 +57,16 @@ def __init__(self, self.model_name = model_name self.encoding_name = encoding_name - if self.model_name is not None: + if self.model_name == 'gpt-4-chatml': + # this is the gpt-4 tokenizer with two additional special tokens + from tiktoken_ext.openai_public import cl100k_base + + encoding_kwargs = cl100k_base() + encoding_kwargs['name'] = encoding_kwargs['name'].replace('base', 'chat') + encoding_kwargs['special_tokens']['<|im_start|>'] = 100261 + encoding_kwargs['special_tokens']['<|im_end|>'] = 100262 + self.encoding = tiktoken.Encoding(**encoding_kwargs) + elif self.model_name is not None: self.encoding = tiktoken.encoding_for_model( # type: ignore (thirdParty) self.model_name) elif self.encoding_name is not None: