# VLM: special multimodal Tokenizer #34461

Merged 24 commits on Nov 4, 2024. Changes shown from 1 commit.
**docs/source/en/main_classes/tokenizer.md** (19 additions, 0 deletions)

@@ -51,6 +51,25 @@
token space (e.g., getting the index of the token comprising a given character or the span of characters corresponding
to a given token).


# Multimodal Tokenizer

Apart from that, each tokenizer can be a "multimodal" tokenizer, which means that the tokenizer will hold all relevant special tokens
as part of its attributes for easier access. For example, if the tokenizer is loaded from a vision-language model like LLaVA, you will
be able to access `tokenizer.image_token_id` to obtain the special image token used as a placeholder.

To enable extra special tokens for any type of tokenizer, you have to add the following lines and save the tokenizer. Extra special tokens do not
have to be modality-related and can be anything that the model often needs access to. In the code below, the tokenizer at `output_dir` will have direct access
to three more special tokens.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.extra_special_tokens = ["image_token", "boi_token", "eoi_token"]
```
**Collaborator** left a suggested change:

```diff
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-tokenizer.extra_special_tokens = ["image_token", "boi_token", "eoi_token"]
+tokenizer = AutoTokenizer.from_pretrained(model_id, extra_special_tokens=["image_token", "boi_token", "eoi_token"])
```

**Collaborator:** let's add a small test for this

**Member (author):** yes, this is actually not correct anymore hehe, forgot to update the docs. And it has a test for that already, so we are good.

The new way of adding extra special tokens is `tokenizer.extra_special_tokens = {"eoi_token": "<s>", "image_token": "<image>"}`. After adding this line and saving the tokenizer, loading it back will do the magic and the tokenizer will have a `self.image_token` attribute.
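A minimal sketch of that save/load round trip in the dict style described above (the checkpoint id and token strings here are placeholders, not values from this PR):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")  # placeholder checkpoint

# dict form: attribute name -> token string
tokenizer.extra_special_tokens = {"image_token": "<image>", "eoi_token": "</s>"}
tokenizer.save_pretrained("vlm_tokenizer")

# loading back wires the named tokens up as attributes
reloaded = AutoTokenizer.from_pretrained("vlm_tokenizer")
print(reloaded.image_token)  # "<image>"
```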

**Collaborator:** we should be able to pass it as input as well instead of forcing people to use the setter! 🤗

**Member (author):** yeap, realized later and added that in the docs instead of "saving-loading back". Plus extended the test.

```python
tokenizer.save_pretrained(output_dir)

vision_tokenizer = AutoTokenizer.from_pretrained(output_dir)
vision_tokenizer.image_token = "IMAGE"
```
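As the thread above suggests, the extra tokens can also be passed directly at load time instead of via the setter. A sketch of that one-step form, assuming the dict style from the author's comment (`model_id` is the same placeholder as in the docs example):

```python
from transformers import AutoTokenizer

# one-step form: no separate setter or save/load round trip needed
vision_tokenizer = AutoTokenizer.from_pretrained(
    model_id, extra_special_tokens={"image_token": "<image>"}
)
print(vision_tokenizer.image_token)  # "<image>"
```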

## PreTrainedTokenizer

[[autodoc]] PreTrainedTokenizer
**src/transformers/tokenization_utils_base.py** (4 additions, 6 deletions)

```diff
@@ -1400,7 +1400,6 @@ def __init__(self, **kwargs):
         self.init_kwargs = copy.deepcopy(kwargs)
         self.name_or_path = kwargs.pop("name_or_path", "")
         self._processor_class = kwargs.pop("processor_class", None)
-        self.is_multimodal = kwargs.pop("is_multimodal", False)

         # For backward compatibility we fallback to set model_max_length from max_len if provided
         model_max_length = kwargs.pop("model_max_length", kwargs.pop("max_len", None))
@@ -1440,9 +1439,8 @@ def __init__(self, **kwargs):

         super().__init__(**kwargs)

-        if self.is_multimodal:
-            extra_special_tokens = ["image_token", "video_token", "boi_token", "eoi_token", "image_boundary_token"]
-            self._set_model_specific_special_tokens(special_tokens=extra_special_tokens)
+        self.extra_special_tokens = kwargs.pop("extra_special_tokens", [])
+        self._set_model_specific_special_tokens(special_tokens=self.extra_special_tokens)

     @property
     def max_len_single_sentence(self) -> int:
```
**Collaborator:** when we do this, we don't add them to the tokenizer vocab, right?

**Collaborator:** I think you are already checking that these tokens are added to the vocab if not already present, right?

**Member (author):** if the special token is not present in the vocab, we do add it as a new token to the tokenizer vocab. Should we prevent users from adding new tokens and only allow tokens that are already available?

It happens because the tokenizer is wired to do that from the start, irrespective of the current changes:

```python
# 4. If some of the special tokens are not part of the vocab, we add them, at the end.
# the order of addition is the same as self.SPECIAL_TOKENS_ATTRIBUTES following `tokenizers`
```

**Collaborator:** no, it's alright IMO, we have not really seen reports about that


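The helper that the new `__init__` lines call is not shown in this diff. Below is a rough sketch of what a `_set_model_specific_special_tokens`-style helper might do, purely as an illustration of the mechanism rather than the actual implementation:

```python
def _set_model_specific_special_tokens(self, special_tokens):
    # Sketch only: register each extra special token as a first-class
    # attribute so that e.g. `tokenizer.image_token` works. Assumes either
    # a list of attribute names or a dict mapping names to token strings.
    if isinstance(special_tokens, dict):
        for name, token in special_tokens.items():
            setattr(self, name, token)
    else:
        for name in special_tokens:
            # attribute is created; its value is expected to be filled in later
            setattr(self, name, None)
```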
```diff
@@ -2404,8 +2402,8 @@ def save_pretrained(

         # Let's make sure we properly save the special tokens and flag whether it is a multimodal tokenizer.
         tokenizer_config.update(self.special_tokens_map)
-        if self.is_multimodal and "is_multimodal" not in tokenizer_config:
-            tokenizer_config["is_multimodal"] = True
+        if "extra_special_tokens" not in tokenizer_config:
+            tokenizer_config["extra_special_tokens"] = self.extra_special_tokens

         if self.chat_template is not None:
             if isinstance(self.chat_template, dict):
```
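With this change, the extra tokens are persisted in `tokenizer_config.json`, which is what makes the reload work. A quick way to inspect what gets written, reusing `tokenizer` and `output_dir` from the docs example above (sketch only):

```python
import json
import os

tokenizer.save_pretrained(output_dir)
with open(os.path.join(output_dir, "tokenizer_config.json")) as f:
    config = json.load(f)

# the extra tokens are written alongside the standard special tokens
print(config["extra_special_tokens"])
```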
**tests/tokenization/test_tokenization_utils.py** (2 additions, 2 deletions)

```diff
@@ -281,7 +281,7 @@ def test_decoding_single_token(self):
         self.assertEqual(decoded_flat, "##:")
         self.assertEqual(decoded_list, "##:")

-    def test_extra_sepcial_tokens_multimodal(self):
+    def test_extra_special_tokens_multimodal(self):
         special_tokens_list = [
             "bos_token",
             "eos_token",
@@ -293,7 +293,7 @@ def test_extra_sepcial_tokens_multimodal(self):
             "additional_special_tokens",
         ]
         llama_tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b")
-        llama_tokenizer.is_multimodal = True
+        llama_tokenizer.extra_special_tokens = ["image_token", "boi_token", "eoi_token"]
         self.assertListEqual(llama_tokenizer.SPECIAL_TOKENS_ATTRIBUTES, special_tokens_list)
         with tempfile.TemporaryDirectory() as tmpdirname:
             llama_tokenizer.save_pretrained(tmpdirname)
```
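The diff cuts off right after the save. A rough sketch of how the round trip presumably continues in the test (the assertion here is illustrative, not copied from the PR):

```python
import tempfile

from transformers import LlamaTokenizerFast

tok = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b")
tok.extra_special_tokens = ["image_token", "boi_token", "eoi_token"]

with tempfile.TemporaryDirectory() as tmpdirname:
    tok.save_pretrained(tmpdirname)
    reloaded = LlamaTokenizerFast.from_pretrained(tmpdirname)
    # the extra tokens should come back as first-class attributes
    assert hasattr(reloaded, "image_token")
```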