
Merge branch 'main' into disable-env
irenedea authored Jan 8, 2024
2 parents ea245b8 + 5b99488 commit 2f3feb9
Showing 3 changed files with 24 additions and 5 deletions.
5 changes: 4 additions & 1 deletion llmfoundry/tokenizers/tiktoken.py
@@ -253,7 +253,10 @@ def _convert_token_to_id(self, token: str) -> Optional[int]:
 
     def _convert_id_to_token(self, index: int) -> Optional[str]:
         """Converts an index (integer) in a token (str) using the vocab."""
-        return self.decoder.get(index)
+        # For tokens in either the gap in ids in the tokenizer, or beyond the range of the tokenizer,
+        # we return empty string. This matches the behavior of Hugging Face fast tokenizers,
+        # but not slow tokenizers.
+        return self.decoder.get(index, '')
 
     def convert_tokens_to_string(self, tokens: List[str]) -> str:
         """Converts a sequence of tokens (string) in a single string."""
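
For context, a minimal sketch of the behavior change, using a hypothetical decoder dict standing in for the wrapper's id-to-token map:

    # Hypothetical vocab: ids 0 and 1 exist, id 2 is a gap or out of range.
    decoder = {0: 'hello', 1: 'world'}

    # Before this change: missing ids decoded to None, which breaks
    # downstream string joins.
    print(decoder.get(2))      # None

    # After this change: missing ids decode to the empty string, matching
    # Hugging Face fast tokenizers.
    print(decoder.get(2, ''))  # ''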
4 changes: 2 additions & 2 deletions llmfoundry/utils/huggingface_hub_utils.py
@@ -59,7 +59,7 @@ def process_file(
     folder_path: str,
     flatten_imports_prefix: Sequence[str],
 ) -> list[str]:
-    with open(file_path, 'r') as f:
+    with open(file_path, 'r', encoding='utf-8') as f:
         source = f.read()
 
     parent_module_name = None
@@ -102,7 +102,7 @@ def process_file(
     if new_filename == '__init__.py':
         new_filename = file_path.split('/')[-2] + '.py'
     new_file_path = os.path.join(folder_path, new_filename)
-    with open(new_file_path, 'w') as f:
+    with open(new_file_path, 'w', encoding='utf-8') as f:
         assert new_tree is not None
         f.write(ast.unparse(new_tree))
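
A short sketch of why the explicit encoding matters here (assumes a UTF-8 source file on disk): open() with no encoding argument falls back to locale.getpreferredencoding(False), which is often cp1252 on Windows, so reading UTF-8 files there can raise UnicodeDecodeError.

    import locale

    # Platform-dependent default used by open() when encoding is omitted,
    # e.g. 'UTF-8' on most Linux systems but 'cp1252' on many Windows setups.
    print(locale.getpreferredencoding(False))

    # Passing encoding explicitly makes reads and writes behave the same
    # on every platform.
    with open('example.py', 'r', encoding='utf-8') as f:
        source = f.read()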
20 changes: 18 additions & 2 deletions tests/tokenizers/test_tiktoken.py
@@ -338,19 +338,23 @@ def test_additional_special_tokens(model_name: Optional[str],
                                    encoding_name: Optional[str],
                                    tmp_path: pathlib.Path):
     special_token_to_add = '<|im_start|>'
+    input_string = special_token_to_add + ' hello'
     wrapped_tokenizer, _, _ = get_tokenizers_for_testing(
         model_name,
         encoding_name,
         tmp_path,
         add_bos_token=False,
         add_eos_token=False,
         additional_special_tokens=[special_token_to_add])
-    encoded_outputs = wrapped_tokenizer(special_token_to_add +
-                                        ' hello')['input_ids']
+    encoded_outputs = wrapped_tokenizer(input_string)['input_ids']
 
     assert encoded_outputs[0] == wrapped_tokenizer.vocab_size
     assert len(encoded_outputs) == 2
+
+    decoded_outputs = wrapped_tokenizer.decode(
+        encoded_outputs, spaces_between_special_tokens=False)
+    assert decoded_outputs == input_string
 
 
 @pytest.mark.parametrize('model_name,encoding_name',
                          MODEL_ENCODING_NAME_PARAMETRIZATION)
@@ -386,3 +390,15 @@ def test_chat_formatting(model_name: Optional[str],
     chat_str = wrapped_tokenizer.apply_chat_template(
         dict_chats, tokenize=False, add_generation_prompt=True)
     assert chat_str == MULTI_TURN_GENERATE_STRING[i]
+
+
+def test_tiktoken_out_of_range():
+    wrapped_tokenizer = TiktokenTokenizerWrapper(model_name='gpt-4',)
+
+    # For gpt-4, 100256 is less than the vocab size, but is not a valid token
+    assert wrapped_tokenizer.decode([100256]) == ''
+    assert wrapped_tokenizer.decode(100256) == ''
+
+    # For gpt-4, 1000000 is greater than the vocab size
+    assert wrapped_tokenizer.decode([1000000]) == ''
+    assert wrapped_tokenizer.decode(1000000) == ''
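
A quick usage sketch of what the new test exercises (assumes llm-foundry and tiktoken are installed; the specific ids are properties of gpt-4's cl100k_base encoding, per the test comments above):

    from llmfoundry.tokenizers.tiktoken import TiktokenTokenizerWrapper

    tokenizer = TiktokenTokenizerWrapper(model_name='gpt-4')

    # Hugging Face decode() accepts a single id or a list of ids; with the
    # fix above, ids in vocab gaps or past the vocab now decode to ''.
    print(repr(tokenizer.decode(100256)))    # ''
    print(repr(tokenizer.decode([100256])))  # ''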
