Fix tiktoken vocab size (#1081)
* fix vocab size bug

* more tests
dakinggg authored Apr 2, 2024
1 parent b765b47 commit caf7fda
Showing 2 changed files with 19 additions and 1 deletion.
2 changes: 1 addition & 1 deletion llmfoundry/tokenizers/tiktoken.py
@@ -229,7 +229,7 @@ def get_vocab(self) -> Dict[str, int]:
             # Get an index to add and add the item
             vocab_clone[candidate_extra_id] = index_to_add
 
-        return vocab_clone
+        return dict(vocab_clone, **self.added_tokens_encoder)
 
     def _tokenize(self, text: str) -> List[str]:
         """Returns a tokenized string."""
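
A note on the fix: added_tokens_encoder holds the string-to-id mapping for tokens registered after the tokenizer is built (for example via additional_special_tokens), and the previous code returned the copied base vocab without it, so those tokens never showed up in get_vocab() or its length. A minimal standalone sketch of the merge pattern used above, with hypothetical toy values rather than the wrapper's real state:

# Toy illustration of dict(vocab_clone, **added_tokens_encoder):
# entries from the added-tokens mapping are layered onto the copied vocab.
vocab_clone = {'hello': 0, 'world': 1}        # hypothetical base vocab copy
added_tokens_encoder = {'<|im_start|>': 2}    # hypothetical token added after construction

merged = dict(vocab_clone, **added_tokens_encoder)
assert merged == {'hello': 0, 'world': 1, '<|im_start|>': 2}
assert len(merged) == len(vocab_clone) + 1    # the added token is now counted
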
18 changes: 18 additions & 0 deletions tests/tokenizers/test_tiktoken.py
@@ -364,6 +364,24 @@ def test_additional_special_tokens(model_name: Optional[str],
     assert decoded_outputs == input_string
 
 
+def test_additional_special_tokens_len():
+    special_token_to_add = '<|im_start|>'
+    with_special = TiktokenTokenizerWrapper(
+        model_name='gpt-4', additional_special_tokens=[special_token_to_add])
+
+    no_special = TiktokenTokenizerWrapper(model_name='gpt-4',)
+    assert len(with_special.get_vocab()) == len(no_special.get_vocab()) + 1
+
+    ret = with_special.add_special_tokens(
+        {'additional_special_tokens': ['<|im_start|>']})
+    assert ret == 0
+
+    ret = with_special.add_special_tokens(
+        {'additional_special_tokens': ['<|im_end|>']})
+    assert ret == 1
+    assert len(with_special.get_vocab()) == len(no_special.get_vocab()) + 2
+
+
 @pytest.mark.parametrize('model_name,encoding_name',
                          MODEL_ENCODING_NAME_PARAMETRIZATION)
 def test_chat_formatting(model_name: Optional[str],
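
For context, the user-visible effect the new test pins down: special tokens registered on the wrapper are now counted by get_vocab(). A short usage sketch based only on the calls exercised in the test, with the import path taken from the changed file's location:

from llmfoundry.tokenizers.tiktoken import TiktokenTokenizerWrapper  # path assumed from llmfoundry/tokenizers/tiktoken.py

tok = TiktokenTokenizerWrapper(model_name='gpt-4')
before = len(tok.get_vocab())

# Registering a genuinely new special token reports one token added...
assert tok.add_special_tokens({'additional_special_tokens': ['<|im_start|>']}) == 1
# ...and, with this fix, the vocab returned by get_vocab() grows to match.
assert len(tok.get_vocab()) == before + 1
assert '<|im_start|>' in tok.get_vocab()
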
