Fix tiktoken vocab size (#1081)
* fix vocab size bug

* more tests
dakinggg authored Apr 2, 2024
1 parent b765b47 commit caf7fda
Showing 2 changed files with 19 additions and 1 deletion.
2 changes: 1 addition & 1 deletion llmfoundry/tokenizers/tiktoken.py
@@ -229,7 +229,7 @@ def get_vocab(self) -> Dict[str, int]:
             # Get an index to add and add the item
             vocab_clone[candidate_extra_id] = index_to_add
 
-        return vocab_clone
+        return dict(vocab_clone, **self.added_tokens_encoder)
 
     def _tokenize(self, text: str) -> List[str]:
         """Returns a tokenized string."""
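
A note on the fix: added_tokens_encoder holds the string-to-id mapping for tokens registered after the tokenizer is built (for example via additional_special_tokens), and the previous code returned the copied base vocab without it, so those tokens never showed up in get_vocab() or its length. A minimal standalone sketch of the merge pattern used above, with hypothetical toy values rather than the wrapper's real state:

# Toy illustration of dict(vocab_clone, **added_tokens_encoder):
# entries from the added-tokens mapping are layered onto the copied vocab.
vocab_clone = {'hello': 0, 'world': 1}        # hypothetical base vocab copy
added_tokens_encoder = {'<|im_start|>': 2}    # hypothetical token added after construction

merged = dict(vocab_clone, **added_tokens_encoder)
assert merged == {'hello': 0, 'world': 1, '<|im_start|>': 2}
assert len(merged) == len(vocab_clone) + 1    # the added token is now counted
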
18 changes: 18 additions & 0 deletions tests/tokenizers/test_tiktoken.py
@@ -364,6 +364,24 @@ def test_additional_special_tokens(model_name: Optional[str],
     assert decoded_outputs == input_string
 
 
+def test_additional_special_tokens_len():
+    special_token_to_add = '<|im_start|>'
+    with_special = TiktokenTokenizerWrapper(
+        model_name='gpt-4', additional_special_tokens=[special_token_to_add])
+
+    no_special = TiktokenTokenizerWrapper(model_name='gpt-4',)
+    assert len(with_special.get_vocab()) == len(no_special.get_vocab()) + 1
+
+    ret = with_special.add_special_tokens(
+        {'additional_special_tokens': ['<|im_start|>']})
+    assert ret == 0
+
+    ret = with_special.add_special_tokens(
+        {'additional_special_tokens': ['<|im_end|>']})
+    assert ret == 1
+    assert len(with_special.get_vocab()) == len(no_special.get_vocab()) + 2
+
+
 @pytest.mark.parametrize('model_name,encoding_name',
                          MODEL_ENCODING_NAME_PARAMETRIZATION)
 def test_chat_formatting(model_name: Optional[str],
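
For context, the user-visible effect the new test pins down: special tokens registered on the wrapper are now counted by get_vocab(). A short usage sketch based only on the calls exercised in the test, with the import path taken from the changed file's location:

from llmfoundry.tokenizers.tiktoken import TiktokenTokenizerWrapper  # path assumed from llmfoundry/tokenizers/tiktoken.py

tok = TiktokenTokenizerWrapper(model_name='gpt-4')
before = len(tok.get_vocab())

# Registering a genuinely new special token reports one token added...
assert tok.add_special_tokens({'additional_special_tokens': ['<|im_start|>']}) == 1
# ...and, with this fix, the vocab returned by get_vocab() grows to match.
assert len(tok.get_vocab()) == before + 1
assert '<|im_start|>' in tok.get_vocab()
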
