Add ChatML special tokens to the GPT-4 tokenizer

mosaicml · Oct 24, 2023 · b033b9a · b033b9a
1 parent 091ddca
commit b033b9a
Showing 1 changed file with 10 additions and 1 deletion.
diff --git a/llmfoundry/tokenizers/tiktoken.py b/llmfoundry/tokenizers/tiktoken.py
@@ -57,7 +57,16 @@ def __init__(self,
         self.model_name = model_name
         self.encoding_name = encoding_name
 
-        if self.model_name is not None:
+        if self.model_name == 'gpt-4-chatml':
+            # gpt-4 tokenizer (cl100k_base) extended with two extra special tokens: <|im_start|> and <|im_end|>
+            from tiktoken_ext.openai_public import cl100k_base
+
+            encoding_kwargs = cl100k_base()
+            encoding_kwargs['name'] = encoding_kwargs['name'].replace('base', 'chat')
+            encoding_kwargs['special_tokens']['<|im_start|>'] = 100261
+            encoding_kwargs['special_tokens']['<|im_end|>'] = 100262
+            self.encoding = tiktoken.Encoding(**encoding_kwargs)
+        elif self.model_name is not None:
             self.encoding = tiktoken.encoding_for_model(  # type: ignore (thirdParty)
                 self.model_name)
         elif self.encoding_name is not None: