Skip to content

Commit

Permalink
add chatml tokens to gpt4 tokenizer
Browse files — browse the repository at this point in the history
  • Loading branch information
samhavens committed Oct 24, 2023
1 parent 091ddca commit b033b9a
Showing 1 changed file with 10 additions and 1 deletion.
11 changes: 10 additions & 1 deletion llmfoundry/tokenizers/tiktoken.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,16 @@ def __init__(self,
self.model_name = model_name
self.encoding_name = encoding_name

if self.model_name is not None:
if self.model_name == 'gpt-4-chatml':
# this is the gpt-4 tokenizer with two additional special tokens
from tiktoken_ext.openai_public import cl100k_base

encoding_kwargs = cl100k_base()
encoding_kwargs['name'] = encoding_kwargs['name'].replace('base', 'chat')
encoding_kwargs['special_tokens']['<|im_start|>'] = 100261
encoding_kwargs['special_tokens']['<|im_end|>'] = 100262
self.encoding = tiktoken.Encoding(**encoding_kwargs)
elif self.model_name is not None:
self.encoding = tiktoken.encoding_for_model( # type: ignore (thirdParty)
self.model_name)
elif self.encoding_name is not None:
Expand Down

0 comments on commit b033b9a

Please sign in to comment.