Fix prepare + apply #7

Merged Dec 17, 2024 (29 commits; changes shown from 14 commits)

Commits
6097a8d
fix prepare + apply
jmamou Dec 8, 2024
71562fc
move to cpu
jmamou Dec 8, 2024
3b4e9da
simplify suppress_tokens
jmamou Dec 9, 2024
1dcdae4
fix bugs and refactoring
jmamou Dec 9, 2024
10d1e56
device move
jmamou Dec 9, 2024
f9a260f
handle self.config.vocab_size > len(target_tokenizer.get_vocab())
jmamou Dec 10, 2024
0d3310d
no need to normalize in candidate_generator
jmamou Dec 10, 2024
98cd50b
address Nadav's comments + minor
jmamou Dec 11, 2024
8260624
optimize device move + SuppressTokensLogitsProcessor
jmamou Dec 11, 2024
ff7977e
AssistantToTargetTranslator, SuppressTokensLogitsProcessor and tokeni…
jmamou Dec 12, 2024
38d81b1
padding size
jmamou Dec 12, 2024
6a7d3b3
padding improvement
jmamou Dec 12, 2024
e4e53b9
fix and simplify get_target_logits
jmamou Dec 12, 2024
a19a9de
renaming in get_target_logits
jmamou Dec 12, 2024
c4e4186
minor
jmamou Dec 15, 2024
0ec0788
add filter_value and suppress_tokens_id
jmamou Dec 15, 2024
200f7a0
style + rename
jmamou Dec 15, 2024
95bfa2c
remove TODO
jmamou Dec 16, 2024
1cbc871
restore original SelectTokensLogitsProcessor with modification
jmamou Dec 16, 2024
4a94849
fix style
jmamou Dec 16, 2024
f1b6b08
fix _update_past_and_masks and optimize code
jmamou Dec 16, 2024
df68533
remove assistant_vocab_size arg
jmamou Dec 16, 2024
35e354a
fix attention_mask
jmamou Dec 16, 2024
a558bd0
call _prepare_attention_mask also if not has_past_key_values
jmamou Dec 16, 2024
5c3ad58
handling attention mask for first generation
jmamou Dec 17, 2024
811a4e5
comment
jmamou Dec 17, 2024
2dcc9ed
restore test
jmamou Dec 17, 2024
f2be0da
remove SelectTokensLogitsProcessor
jmamou Dec 17, 2024
83b8250
_update_past_and_masks implementation for USD
jmamou Dec 17, 2024
153 changes: 100 additions & 53 deletions src/transformers/generation/candidate_generator.py
@@ -24,7 +24,6 @@
from ..cache_utils import DynamicCache
from ..pytorch_utils import isin_mps_friendly
from .logits_process import (
LogitNormalization,
LogitsProcessorList,
MinLengthLogitsProcessor,
SuppressTokensLogitsProcessor,
@@ -565,34 +564,44 @@ class AssistantToTargetTranslator:
Translate the assistant into the target universe.
"""

def __init__(self, target_tokenizer: "PreTrainedTokenizerBase", assistant_tokenizer: "PreTrainedTokenizerBase"):
def __init__(
self,
target_tokenizer: "PreTrainedTokenizerBase",
assistant_tokenizer: "PreTrainedTokenizerBase",
assistant_model_device,
target_vocab_size: int,
assistant_vocab_size: int,
):
self._target_tokenizer: "PreTrainedTokenizerBase" = target_tokenizer
self._assistant_tokenizer: "PreTrainedTokenizerBase" = assistant_tokenizer
self._assistant_to_target_input_ids: dict[int, int] = self._get_assistant_to_target_input_ids()
self.suppress_input_ids: list[int] = self._get_suppress_input_ids()
self._assistant_model_device = assistant_model_device
self.target_vocab_size: int = target_vocab_size
self._assistant_to_target_input_ids = self._get_assistant_to_target_input_ids()
self.logits_processors: LogitsProcessorList = LogitsProcessorList(
[
SuppressTokensLogitsProcessor(self.suppress_input_ids),
LogitNormalization(),
SuppressTokensLogitsProcessor(
self._get_mapped_input_ids(), assistant_vocab_size, self._assistant_model_device
)
]
)

def _get_assistant_to_target_input_ids(self) -> dict[int, int]:
"""
Get a mapping from assistant tokens to target tokens based on vocabularies.
"""
def _get_assistant_to_target_input_ids(self):
target_vocab = self._target_tokenizer.get_vocab()
assistant_vocab = self._assistant_tokenizer.get_vocab()
return {
assistant_vocab[tok]: target_vocab[tok] for tok in set(target_vocab.keys()) & set(assistant_vocab.keys())
}

def _get_suppress_input_ids(self) -> list[int]:
max_assistant_index = max(assistant_vocab.values())
assistant_to_target_input_ids = torch.full(
(max_assistant_index + 1,), -1, dtype=int
) # -1 means not in target vocab
for tok, idx in assistant_vocab.items():
if tok in target_vocab:
assistant_to_target_input_ids[idx] = target_vocab[tok]
return assistant_to_target_input_ids.to(self._assistant_model_device)

def _get_mapped_input_ids(self) -> list[int]:
"""
Get the input ids that are in the assistant vocab but not in the target vocab.
Get the input ids that are both in the assistant vocab and in the target vocab.
"""
assistant_vocab = self._assistant_tokenizer.get_vocab()
return list(set(assistant_vocab.values()) - set(self._assistant_to_target_input_ids.keys()))
return torch.where(self._assistant_to_target_input_ids != -1)[0]

def get_target_ids(
self, assistant_input_ids, target_input_ids, assistant_candidate_ids: torch.LongTensor
@@ -602,33 +611,42 @@ def get_target_ids(
Note that we already have the target ids for the prompt and we only need to find the target ids for the new tokens.
Moreover, assistant ids of the original prompt do not necessarily appear in _assistant_to_target_input_ids.
"""
device = assistant_candidate_ids.device
target_candidate_ids = (
assistant_candidate_ids[0, -(len(assistant_candidate_ids[0]) - assistant_input_ids.shape[1]) :]
.cpu()
.apply_(lambda x: self._assistant_to_target_input_ids.get(x, x))
.to(device)
)
return torch.cat((target_input_ids, target_candidate_ids.unsqueeze(0)), dim=1)

num_new_tokens = len(assistant_candidate_ids[0]) - assistant_input_ids.shape[1]
if num_new_tokens == 0:
return target_input_ids
else:
transformed_slice = self._assistant_to_target_input_ids[assistant_candidate_ids[0, -num_new_tokens:]]
return torch.cat((target_input_ids, transformed_slice.unsqueeze(0)), dim=1)

def get_target_logits(self, assistant_logits: torch.FloatTensor) -> torch.FloatTensor:
"""
Return the target logits that correspond to the assistant logits.
"""
device = assistant_logits.device
target_vocab_size: int = len(self._target_tokenizer.get_vocab())
target_shape: tuple[int, ...] = (*assistant_logits.shape[:-1], target_vocab_size)
target_logits: torch.FloatTensor = torch.full(target_shape, -float("inf")).to(device)
assistant_logits_supported_mask: torch.BoolTensor = assistant_logits > -float("inf")
assistant_logits_supported_indices: torch.IntTensor = assistant_logits_supported_mask.nonzero(as_tuple=True)[
-1
]
target_logits_supported_indices: torch.IntTensor = (
assistant_logits_supported_indices.cpu()
.apply_(lambda x: self._assistant_to_target_input_ids[x])
.to(device)
)
target_logits[..., target_logits_supported_indices] = assistant_logits[..., assistant_logits_supported_mask]

target_shape: tuple[int, ...] = (*assistant_logits.shape[:-1], self.target_vocab_size)
target_logits: torch.FloatTensor = torch.full(target_shape, -float("inf")).to(self._assistant_model_device)
assistant_indices_mask = self._assistant_to_target_input_ids != -1 # Mask for valid indices
target_logits_supported_indices = self._assistant_to_target_input_ids[assistant_indices_mask] # Exclude invalid indices
valid_assistant_logits = assistant_logits[..., :self._assistant_to_target_input_ids.shape[0]]

target_logits[..., target_logits_supported_indices] = valid_assistant_logits[..., assistant_indices_mask]

# assistant_logits_supported_mask: torch.BoolTensor = assistant_logits > -float("inf")
# assistant_logits_supported_indices: torch.IntTensor = assistant_logits_supported_mask.nonzero(as_tuple=True)[
# -1
# ]
# target_logits_supported_indices = self._assistant_to_target_input_ids[assistant_logits_supported_indices]
# target_logits[..., target_logits_supported_indices] = assistant_logits[..., assistant_logits_supported_mask]
# if hasattr(self, "_padding_size"):
# padding = torch.full((target_logits.size(0), target_logits.size(1), self._padding_size), -float("inf")).to(
# self._assistant_model_device
# )
# padding_side_actions = {
# "right": lambda: torch.cat((target_logits, padding), dim=2),
# "left": lambda: torch.cat((padding, target_logits), dim=2)
# }
# target_logits = padding_side_actions.get(self._target_tokenizer.padding_side, lambda: target_logits)()
return target_logits


@@ -643,7 +661,12 @@ class AssistantVocabTranslatorCache:

@classmethod
def get_translator(
cls, target_tokenizer: "PreTrainedTokenizerBase", assistant_tokenizer: "PreTrainedTokenizerBase"
cls,
target_tokenizer: "PreTrainedTokenizerBase",
assistant_tokenizer: "PreTrainedTokenizerBase",
assistant_model_device,
target_vocab_size: int,
assistant_vocab_size: int,
) -> AssistantToTargetTranslator:
with cls._lock:
assistant_dict = cls._cache.get(target_tokenizer)
@@ -653,7 +676,13 @@ def get_translator(

mapping = assistant_dict.get(assistant_tokenizer)
if mapping is None:
mapping = AssistantToTargetTranslator(target_tokenizer, assistant_tokenizer)
mapping = AssistantToTargetTranslator(
target_tokenizer,
assistant_tokenizer,
assistant_model_device,
target_vocab_size,
assistant_vocab_size,
)
assistant_dict[assistant_tokenizer] = mapping

return mapping
@@ -692,11 +721,18 @@ def __init__(
assistant_tokenizer: "PreTrainedTokenizerBase",
generation_config: "GenerationConfig",
model_kwargs: Dict,
target_vocab_size: int,
inputs_tensor: Optional[torch.Tensor] = None,
logits_processor: "LogitsProcessorList" = None,
):
# Initialize translator before parent class
self._atm_translator = AssistantVocabTranslatorCache.get_translator(target_tokenizer, assistant_tokenizer)
self._atm_translator = AssistantVocabTranslatorCache.get_translator(
target_tokenizer,
assistant_tokenizer,
assistant_model.device,
target_vocab_size,
assistant_model.config.vocab_size,
)
super().__init__(
input_ids,
assistant_model,
@@ -708,8 +744,9 @@ def __init__(
logits_processor,
)
# Track sequence lengths and previous assistant IDs
self._prev_target_seq_len: int = 0
self._target_seq_len_with_candidates: int = 0
self._prev_assistant_ids: Optional[torch.LongTensor] = None
self.target_vocab_size = target_vocab_size

def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
"""
@@ -732,15 +769,18 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor,
generation_args["generation_config"].return_dict_in_generate = True

# Generate and process outputs using translator
generation_args["logits_processor"] = self._atm_translator.logits_processors
assistant_output = self.assistant_model.generate(**generation_args, **self.assistant_kwargs)
self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values

candidate_logits = torch.stack(assistant_output.scores, dim=1)

# Use translator to convert tokens and logits
candidate_ids = assistant_output.sequences
candidate_logits = self._atm_translator.logits_processors(input_ids=candidate_ids, scores=candidate_logits)
target_ids = self._atm_translator.get_target_ids(assistant_input_ids, target_input_ids, candidate_ids)
self._prev_assistant_ids = assistant_output.sequences
target_ids = self._atm_translator.get_target_ids(
assistant_input_ids, target_input_ids, self._prev_assistant_ids
)
self._target_seq_len_with_candidates = target_ids.shape[-1]
target_logits = self._atm_translator.get_target_logits(candidate_logits)

return target_ids, target_logits
@@ -751,9 +791,11 @@ def _prepare_assistant_input_ids(self, target_input_ids: torch.LongTensor) -> to
"""
# Calculate new tokens since last call
target_seq_len = target_input_ids.shape[-1]
new_token_count = target_seq_len - self._prev_target_seq_len
if self._target_seq_len_with_candidates == 0:
new_token_count = target_seq_len
else:
new_token_count = 1
target_new_ids = target_input_ids[:, -new_token_count:]
self._prev_target_seq_len = target_seq_len

# Convert only the new tokens
target_new_text = self.target_tokenizer.batch_decode(
@@ -765,11 +807,16 @@ def _prepare_assistant_input_ids(self, target_input_ids: torch.LongTensor) -> to

# Update or initialize assistant IDs
if self._prev_assistant_ids is None:
self._prev_assistant_ids = assistant_new_ids
assistant_input_ids = assistant_new_ids
else:
self._prev_assistant_ids = torch.cat([self._prev_assistant_ids, assistant_new_ids], dim=-1)

return self._prev_assistant_ids
tokens_to_remove = self._target_seq_len_with_candidates + 1 - target_seq_len
# If the number of new tokens is greater than zero, truncate the previous assistant IDs
if tokens_to_remove > 0:
self._prev_assistant_ids = self._prev_assistant_ids[:, :-tokens_to_remove]
assistant_input_ids = torch.cat([self._prev_assistant_ids, assistant_new_ids], dim=-1)
assistant_input_ids = assistant_input_ids.to(torch.int)
Review thread on `assistant_input_ids = assistant_input_ids.to(torch.int)`:

Owner: According to the documentation, `cat` operates on tensors of the same type. What do you think about ensuring that `self._prev_assistant_ids` and `assistant_new_ids` are already of `torch.int` type?

Collaborator (author): Do you mean adding, before the `cat`:

self._prev_assistant_ids = self._prev_assistant_ids.to(torch.int)
assistant_new_ids = assistant_new_ids.to(torch.int)

Owner: What do you think about ensuring we only assign `torch.int` to `self._prev_assistant_ids` and `assistant_new_ids` in the first place, so that we never need to cast them into `torch.int`?

Collaborator (author): We get all the IDs from the tokenizer and their type is int. Do you think it is necessary to ensure they are of int type?

return assistant_input_ids


class PromptLookupCandidateGenerator(CandidateGenerator):
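
The dense mapping built in `_get_assistant_to_target_input_ids` above replaces the earlier dict-plus-`apply_` approach with a single lookup tensor. Below is a minimal, self-contained sketch of that idea using toy vocabularies (the dictionaries, ids, and sizes are made up for illustration; the real class reads them from the two tokenizers and keeps the tensor on the assistant model's device):

```python
import torch

# Toy vocabularies; the real translator reads these from the two tokenizers.
target_vocab = {"<s>": 0, "hello": 1, "world": 2, "!": 3}
assistant_vocab = {"<s>": 0, "hello": 1, "bye": 2, "world": 3}

target_vocab_size = len(target_vocab)
max_assistant_index = max(assistant_vocab.values())

# Dense lookup table: assistant id -> target id, with -1 meaning "not in target vocab".
assistant_to_target = torch.full((max_assistant_index + 1,), -1, dtype=torch.long)
for tok, idx in assistant_vocab.items():
    if tok in target_vocab:
        assistant_to_target[idx] = target_vocab[tok]

# Translating candidate ids becomes a single indexing operation.
assistant_candidate_ids = torch.tensor([[1, 3]])              # "hello world" in assistant ids
print(assistant_to_target[assistant_candidate_ids])           # tensor([[1, 2]])

# Translating logits works the same way: scatter the assistant scores that have
# a target counterpart into a target-vocab-sized tensor filled with -inf.
assistant_logits = torch.randn(1, 1, max_assistant_index + 1)
target_logits = torch.full((1, 1, target_vocab_size), -float("inf"))
valid = assistant_to_target != -1
target_logits[..., assistant_to_target[valid]] = assistant_logits[..., valid]
```

Keeping the table as a tensor means both token translation and logit translation reduce to indexing, with no Python-level loop at generation time.
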
16 changes: 10 additions & 6 deletions src/transformers/generation/logits_process.py
@@ -1860,15 +1860,19 @@ class SuppressTokensLogitsProcessor(LogitsProcessor):
```
"""

def __init__(self, suppress_tokens, device: str = "cpu"):
self.suppress_tokens = torch.tensor(list(suppress_tokens), device=device)
def __init__(
self, mapped_tokens, assistant_vocab_size, assistant_model_device, filter_value: float = -float("Inf")
):
# Initialize a tensor of size assistant_vocab_size with True values
self.suppress_token_mask = torch.ones(assistant_vocab_size, dtype=torch.bool, device=assistant_model_device)

# Set the values at indices specified in mapped_tokens to False
self.suppress_token_mask[mapped_tokens] = False
self.filter_value = filter_value

@add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
vocab_tensor = torch.arange(scores.shape[-1], device=scores.device)
suppress_token_mask = isin_mps_friendly(vocab_tensor, self.suppress_tokens.to(scores.device))
scores = torch.where(suppress_token_mask, -float("inf"), scores)
return scores
return scores.masked_fill_(self.suppress_token_mask, self.filter_value)


class WhisperTimeStampLogitsProcessor(LogitsProcessor):
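
The reworked `SuppressTokensLogitsProcessor` above replaces the per-call `isin` lookup with a boolean mask built once at construction time. A minimal sketch of that pattern with toy sizes (the numbers are illustrative, not taken from any real vocabulary):

```python
import torch

# Toy numbers; the real processor gets these from AssistantToTargetTranslator.
assistant_vocab_size = 6
mapped_tokens = torch.tensor([0, 1, 3])    # assistant ids that also exist in the target vocab
filter_value = -float("inf")

# Built once at construction time: True marks tokens to suppress.
suppress_token_mask = torch.ones(assistant_vocab_size, dtype=torch.bool)
suppress_token_mask[mapped_tokens] = False

# At each generation step, unmapped tokens are pushed to filter_value in place.
scores = torch.randn(2, assistant_vocab_size)                 # (batch, vocab)
scores = scores.masked_fill_(suppress_token_mask, filter_value)
print(scores)                                                 # columns 2, 4 and 5 are -inf
```

Since the mask is precomputed on the assistant model's device and `masked_fill_` works in place, the per-step cost is a single elementwise operation.
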
1 change: 1 addition & 0 deletions src/transformers/generation/utils.py
@@ -858,6 +858,7 @@ def _get_candidate_generator(
logits_processor=logits_processor,
target_tokenizer=target_tokenizer,
assistant_tokenizer=assistant_tokenizer,
target_vocab_size=self.config.vocab_size, # required in the case that self.config.vocab_size is different from the length of target_tokenizer.get_vocab()
)
case False:
candidate_generator = AssistedCandidateGeneratorDifferentTokenizers(
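
The `target_vocab_size=self.config.vocab_size` argument added above matters when the model's logits dimension differs from the tokenizer's vocabulary length, for example when the embedding matrix is padded. A small sketch with hypothetical sizes (neither number comes from a real checkpoint):

```python
import torch

config_vocab_size = 32064      # hypothetical: embedding matrix padded to a multiple of 64
tokenizer_vocab_size = 32000   # hypothetical tokenizer vocabulary length

# The model's logits always follow the config size, so the translated target
# logits must be allocated with config_vocab_size, not len(tokenizer.get_vocab()).
logits = torch.randn(1, 1, config_vocab_size)
assert logits.shape[-1] == config_vocab_size
assert logits.shape[-1] != tokenizer_vocab_size
```
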
2 changes: 1 addition & 1 deletion tests/generation/test_candidate_generator.py
@@ -79,7 +79,7 @@ def test_get_assistant_to_target_input_ids(self):
def test_get_suppress_input_ids(self):
"""Test the suppression of assistant input IDs not present in the target vocabulary."""
expected_suppress_ids = [4]
actual_suppress_ids = self.translator.suppress_input_ids
actual_suppress_ids = self.translator._suppress_input_ids
self.assertEqual(actual_suppress_ids, expected_suppress_ids)

def test_get_target_ids(self):
3 changes: 2 additions & 1 deletion tests/generation/test_configuration_utils.py
@@ -599,7 +599,8 @@ def test_serialize_generation_suppress_tokens(self):
new_config = GenerationConfig.from_pretrained(tmp_dir)
self.assertSequenceEqual(new_config.suppress_tokens, suppress_tokens)

suppress_processor = SuppressTokensLogitsProcessor(suppress_tokens=new_config.suppress_tokens)
# TODO
suppress_processor = SuppressTokensLogitsProcessor(mapped_tokens=new_config.suppress_tokens)
self.assertSequenceEqual(suppress_processor.suppress_tokens, suppress_tokens)

def test_serialize_generation_guidance_scale(self):