From aa7e01a6fd72713079da95852d1bf1764ba37397 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Thu, 28 Nov 2024 16:30:34 -0500 Subject: [PATCH 01/76] move `TestAssistedCandidateGeneratorDifferentTokenizers` into a new testing file --- tests/generation/test_candidate_generator.py | 43 ++++++++++++++++++++ tests/generation/test_utils.py | 39 ------------------ 2 files changed, 43 insertions(+), 39 deletions(-) create mode 100644 tests/generation/test_candidate_generator.py diff --git a/tests/generation/test_candidate_generator.py b/tests/generation/test_candidate_generator.py new file mode 100644 index 00000000000000..03fd51324b022f --- /dev/null +++ b/tests/generation/test_candidate_generator.py @@ -0,0 +1,43 @@ +import unittest + +import numpy as np + +from transformers.generation.candidate_generator import AssistedCandidateGeneratorDifferentTokenizers + + +class TestAssistedCandidateGeneratorDifferentTokenizers(unittest.TestCase): + def test_no_intersection(self): + prompt = np.array([[1, 2, 3]]) + prompt_plus_new_tokens = np.array([[4, 5, 6]]) + result = AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag(prompt, prompt_plus_new_tokens) + self.assertEqual(result, (None, None, None)) + + def test_complete_overlap(self): + prompt = np.array([[1, 2, 3]]) + prompt_plus_new_tokens = np.array([[1, 2, 3, 4, 5]]) + discrep_length, new_tokens_only, discrep_only = AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag( + prompt, prompt_plus_new_tokens + ) + self.assertEqual(discrep_length, 0) + np.testing.assert_array_equal(new_tokens_only, np.array([[4, 5]])) + np.testing.assert_array_equal(discrep_only, np.array([[]])) + + def test_partial_overlap(self): + prompt = np.array([[1, 2, 3]]) + prompt_plus_new_tokens = np.array([[2, 3, 4, 5]]) + discrep_length, new_tokens_only, discrep_only = AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag( + prompt, prompt_plus_new_tokens + ) + self.assertEqual(discrep_length, 0) + np.testing.assert_array_equal(new_tokens_only, np.array([[4, 5]])) + np.testing.assert_array_equal(discrep_only, np.array([[]])) + + def test_no_new_tokens(self): + prompt = np.array([[1, 2, 3]]) + prompt_plus_new_tokens = np.array([[1, 2, 3]]) + discrep_length, new_tokens_only, discrep_only = AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag( + prompt, prompt_plus_new_tokens + ) + self.assertEqual(discrep_length, 0) + np.testing.assert_array_equal(new_tokens_only, np.array([[]])) + np.testing.assert_array_equal(discrep_only, np.array([[]])) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 063e9a3da8fdad..86d7c0b198c055 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -92,7 +92,6 @@ WatermarkDetector, WatermarkingConfig, ) - from transformers.generation.candidate_generator import AssistedCandidateGeneratorDifferentTokenizers from transformers.generation.utils import _speculative_sampling @@ -4274,41 +4273,3 @@ def test_generate_from_inputs_embeds_with_bos_token_id_is_none(self): # bos_token_id is required when no input ids nor inputs_embeds is passed with self.assertRaises(ValueError): model.generate(max_length=20, bos_token_id=None) - - -class TestAssistedCandidateGeneratorDifferentTokenizers(unittest.TestCase): - def test_no_intersection(self): - prompt = np.array([[1, 2, 3]]) - prompt_plus_new_tokens = np.array([[4, 5, 6]]) - result = AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag(prompt, prompt_plus_new_tokens) - self.assertEqual(result, (None, None, 
None)) - - def test_complete_overlap(self): - prompt = np.array([[1, 2, 3]]) - prompt_plus_new_tokens = np.array([[1, 2, 3, 4, 5]]) - discrep_length, new_tokens_only, discrep_only = AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag( - prompt, prompt_plus_new_tokens - ) - self.assertEqual(discrep_length, 0) - np.testing.assert_array_equal(new_tokens_only, np.array([[4, 5]])) - np.testing.assert_array_equal(discrep_only, np.array([[]])) - - def test_partial_overlap(self): - prompt = np.array([[1, 2, 3]]) - prompt_plus_new_tokens = np.array([[2, 3, 4, 5]]) - discrep_length, new_tokens_only, discrep_only = AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag( - prompt, prompt_plus_new_tokens - ) - self.assertEqual(discrep_length, 0) - np.testing.assert_array_equal(new_tokens_only, np.array([[4, 5]])) - np.testing.assert_array_equal(discrep_only, np.array([[]])) - - def test_no_new_tokens(self): - prompt = np.array([[1, 2, 3]]) - prompt_plus_new_tokens = np.array([[1, 2, 3]]) - discrep_length, new_tokens_only, discrep_only = AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag( - prompt, prompt_plus_new_tokens - ) - self.assertEqual(discrep_length, 0) - np.testing.assert_array_equal(new_tokens_only, np.array([[]])) - np.testing.assert_array_equal(discrep_only, np.array([[]])) From f6b7f2041d7636dedb9185e2713d57df088793ab Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Thu, 28 Nov 2024 16:32:53 -0500 Subject: [PATCH 02/76] refactor --- .../generation/candidate_generator.py | 176 +++++++++--------- 1 file changed, 87 insertions(+), 89 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 7cab88a4bc2e65..d85860ad5dc459 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -194,45 +194,15 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, vocabulary_size)` containing the logits associated to each candidate. """ input_ids = input_ids.to(self.assistant_model.device) - - # Don't generate more than `max_length - 1` candidates since the target model generates one extra token. - new_cur_len = input_ids.shape[-1] - max_new_tokens = min(int(self.num_assistant_tokens), self.generation_config.max_length - new_cur_len - 1) - min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - new_cur_len), 0) + # Calculate new tokens to generate + min_new_tokens, max_new_tokens = self._calculate_new_tokens(input_ids) if max_new_tokens == 0: return input_ids, None - - # 1. If it is not the first round of candidate generation, prepare the inputs based on the input_ids length - # (which implicitly contains the number of accepted candidates from the previous round) - has_past_key_values = self.assistant_kwargs.get("past_key_values", None) is not None - if has_past_key_values: - new_cache_size = new_cur_len - 1 - self.assistant_kwargs["past_key_values"] = _crop_past_key_values( - self.assistant_model, self.assistant_kwargs["past_key_values"], new_cache_size - 1 - ) # the assistant does not have the token after the last match, hence the -1 - - self.assistant_kwargs = _prepare_attention_mask( - self.assistant_kwargs, new_cur_len, self.assistant_model.config.is_encoder_decoder - ) - self.assistant_kwargs = _prepare_token_type_ids(self.assistant_kwargs, new_cur_len) - - # 2. Forecast next N tokens using the assistant model. 
- assistant_generation_kwargs = { - self.input_ids_key: input_ids, - "min_new_tokens": min_new_tokens, - "max_new_tokens": max_new_tokens, - "generation_config": self.generation_config, - "logits_processor": self.logits_processor, - } - - assistant_output = self.assistant_model.generate(**assistant_generation_kwargs, **self.assistant_kwargs) - - # 3. Update variables for the next round of candidate generation - self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values - - # 4. Prepare variables for output - candidate_logits = torch.stack(assistant_output.scores, dim=1) - candidate_ids = assistant_output.sequences + # Update past key values and masks + self._update_past_and_masks(input_ids) + # Generate candidates + generation_args = self._prepare_generation_args(input_ids, min_new_tokens, max_new_tokens) + candidate_ids, candidate_logits = self._generate_candidates(generation_args) return candidate_ids, candidate_logits def update_candidate_strategy(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, num_matches: int): @@ -261,6 +231,45 @@ def update_candidate_strategy(self, input_ids: torch.LongTensor, scores: torch.F else: self.num_assistant_tokens = max(1.0, self.num_assistant_tokens - 1.0) + def _calculate_new_tokens(self, input_ids: torch.LongTensor) -> Tuple[int, int]: + """Calculate the minimum and maximum number of new tokens to generate.""" + new_cur_len = input_ids.shape[-1] + max_new_tokens = min(int(self.num_assistant_tokens), self.generation_config.max_length - new_cur_len - 1) + min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - new_cur_len), 0) + return min_new_tokens, max_new_tokens + + def _update_past_and_masks(self, input_ids: torch.LongTensor, remove_from_pkv: int = 0) -> bool: + """Update past key values and attention masks for subsequent generation rounds.""" + has_past_key_values = self.assistant_kwargs.get("past_key_values", None) is not None + if has_past_key_values: + new_cache_size = input_ids.shape[-1] - 1 - remove_from_pkv + self.assistant_kwargs["past_key_values"] = _crop_past_key_values( + self.assistant_model, self.assistant_kwargs["past_key_values"], new_cache_size - 1 + ) + self.assistant_kwargs = _prepare_attention_mask( + self.assistant_kwargs, input_ids.shape[-1], self.assistant_model.config.is_encoder_decoder + ) + self.assistant_kwargs = _prepare_token_type_ids(self.assistant_kwargs, input_ids.shape[-1]) + return has_past_key_values + + def _prepare_generation_args(self, input_ids: torch.LongTensor, min_new_tokens: int, max_new_tokens: int) -> Dict: + """Prepare arguments for the generation call.""" + return { + self.input_ids_key: input_ids, + "min_new_tokens": min_new_tokens, + "max_new_tokens": max_new_tokens, + "generation_config": self.generation_config, + "logits_processor": self.logits_processor, + } + + def _generate_candidates(self, generation_args: Dict) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]: + """Generate candidate sequences using the assistant model.""" + assistant_output = self.assistant_model.generate(**generation_args, **self.assistant_kwargs) + self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values + candidate_logits = torch.stack(assistant_output.scores, dim=1) + candidate_ids = assistant_output.sequences + return candidate_ids, candidate_logits + class AssistedCandidateGeneratorDifferentTokenizers(AssistedCandidateGenerator): """ @@ -310,6 +319,8 @@ def __init__( self.target_tokenizer = target_tokenizer self.assistant_tokenizer = assistant_tokenizer 
+ self.prev_target_ids = None + self.prev_tokens = None self.prev_assistant_ids = None self.target_lookbehind = assistant_model.generation_config.target_lookbehind self.assistant_lookbehind = assistant_model.generation_config.assistant_lookbehind @@ -440,18 +451,41 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, return input_ids, None input_ids = input_ids.to(self.assistant_model.device) + remove_from_pkv = 0 + + assistant_input_ids, remove_from_pkv = self._prepare_assistant_input_ids(input_ids) + self.prev_assistant_ids = assistant_input_ids + + min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - assistant_input_ids.shape[-1]), 0) + + self._update_past_and_masks(assistant_input_ids, remove_from_pkv) + generation_args = self._prepare_generation_args(assistant_input_ids, min_new_tokens, max_new_tokens) + self.assistant_kwargs.pop("attention_mask", None) + + assistant_output = self.assistant_model.generate(**generation_args, **self.assistant_kwargs) + new_target_ids = self._process_assistant_outputs(input_ids, assistant_output.sequences, assistant_input_ids) + + # Update state + self.prev_target_ids = input_ids + self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values + self.prev_tokens = assistant_output.sequences + + if input_ids.shape[1] >= new_target_ids.shape[1]: + return input_ids, None + + return new_target_ids, None + + def _prepare_assistant_input_ids(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, int]: + """Converts target input IDs to assistant input IDs, handling discrepancies.""" convert_kwargs = { "source_tokenizer": self.target_tokenizer, "destination_tokenizer": self.assistant_tokenizer, } remove_from_pkv = 0 - # Since re-encoding the tokens may result in tokenization discrepancies, we use 2 look behind values - # (one for each conversion) which mark where to start looking for the overlap between the - # source and target encodings, to ensure the new tokens include the correct prompt suffix. 
- if self.prev_assistant_ids is not None and input_ids.shape[1] > self.target_lookbehind: + if self.prev_tokens is not None and self.prev_target_ids.shape[1] > self.target_lookbehind: # input_ids contains all target prompt input ids and some new target input ids - start_index_in_target_window = input_ids.shape[1] - self.target_lookbehind + start_index_in_target_window = self.prev_target_ids.shape[1] - self.target_lookbehind new_assistant_ids = self.convert_source_tokens_to_target_tokens( input_ids[:, start_index_in_target_window:], **convert_kwargs @@ -459,8 +493,8 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, prompt_use_length = new_assistant_ids.shape[1] prompt_use = self.prev_assistant_ids[:, -prompt_use_length:] - discrepancy_length, new_tokens_only, discrepancy_only = ( - AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag(prompt_use, new_assistant_ids) + discrepancy_length, new_tokens_only, discrepancy_only = self._get_tokens_diag( + prompt_use, new_assistant_ids ) assistant_input_ids = self.prev_assistant_ids @@ -481,48 +515,21 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, else: # edge case: in case of no intersection between prompt and new_assistant_ids assistant_input_ids = torch.cat([assistant_input_ids, new_assistant_ids], dim=-1) - else: assistant_input_ids = self.convert_source_tokens_to_target_tokens(input_ids, **convert_kwargs) + self.prev_target_ids = input_ids - self.prev_assistant_ids = assistant_input_ids - new_cur_len = assistant_input_ids.shape[-1] - min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - new_cur_len), 0) - - # 1. If it is not the first round of candidate generation, prepare the inputs based on the input_ids length - # (which implicitly contains the number of accepted candidates from the previous round) - has_past_key_values = self.assistant_kwargs.get("past_key_values", None) is not None - if has_past_key_values: - new_cache_size = new_cur_len - 1 - remove_from_pkv - self.assistant_kwargs["past_key_values"] = _crop_past_key_values( - self.assistant_model, self.assistant_kwargs["past_key_values"], new_cache_size - 1 - ) # the assistant does not have the token after the last match, hence the -1 - - self.assistant_kwargs = _prepare_attention_mask( - self.assistant_kwargs, new_cur_len, self.assistant_model.config.is_encoder_decoder - ) - self.assistant_kwargs = _prepare_token_type_ids(self.assistant_kwargs, new_cur_len) - - # 2. Forecast next N tokens using the assistant model. 
- assistant_generation_kwargs = { - self.input_ids_key: assistant_input_ids, - "min_new_tokens": min_new_tokens, - "max_new_tokens": max_new_tokens, - "generation_config": self.generation_config, - "logits_processor": self.logits_processor, - } - - self.assistant_kwargs.pop("attention_mask", None) - - assistant_output = self.assistant_model.generate(**assistant_generation_kwargs, **self.assistant_kwargs) + return assistant_input_ids, remove_from_pkv + def _process_assistant_outputs( + self, input_ids: torch.LongTensor, assistant_sequences: torch.LongTensor, assistant_input_ids: torch.LongTensor + ) -> torch.LongTensor: + """Processes assistant outputs to obtain target input IDs.""" num_prev_assistant = self.prev_assistant_ids.shape[1] start_assistant_look_index = num_prev_assistant - self.assistant_lookbehind - if start_assistant_look_index < 0: - start_assistant_look_index = 0 new_target_ids_from_window = self.convert_source_tokens_to_target_tokens( - assistant_output.sequences[:, start_assistant_look_index:], + assistant_sequences[:, start_assistant_look_index:], source_tokenizer=self.assistant_tokenizer, destination_tokenizer=self.target_tokenizer, ) @@ -530,9 +537,7 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, target_prompt_use = input_ids[:, -target_prompt_use_length:] - _, target_new_tokens_only, _ = AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag( - target_prompt_use, new_target_ids_from_window - ) + _, target_new_tokens_only, _ = self._get_tokens_diag(target_prompt_use, new_target_ids_from_window) new_target_ids = input_ids @@ -546,14 +551,7 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, if hasattr(self.generation_config, "max_length"): new_target_ids = new_target_ids[:, : self.generation_config.max_length] - # 3. Update variables for the next round of candidate generation - self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values - - # 4. Prepare variables for output - if input_ids.shape[1] >= new_target_ids.shape[1]: - return input_ids, None - - return new_target_ids, None + return new_target_ids class PromptLookupCandidateGenerator(CandidateGenerator): From 0ded37c5d5c9dc0086dd9e56a91750117aa28469 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Thu, 28 Nov 2024 17:04:57 -0500 Subject: [PATCH 03/76] NOTHING. add space to rerun github actions tests --- src/transformers/generation/candidate_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index d85860ad5dc459..43ed191df5d0af 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - + import copy from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple From d48b69b0cf405659ef0a18040a42612db1a7b929 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Thu, 28 Nov 2024 17:05:04 -0500 Subject: [PATCH 04/76] remove it... 
--- src/transformers/generation/candidate_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 43ed191df5d0af..d85860ad5dc459 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - + import copy from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple From b47e33a1c3e9ff14e6b25ca0eca9aa63b9d18f8c Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sat, 16 Nov 2024 13:54:03 -0500 Subject: [PATCH 05/76] `UniversalSpeculativeDecodingGenerator` --- .../generation/candidate_generator.py | 132 +++++++++++++++++- 1 file changed, 131 insertions(+), 1 deletion(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index d85860ad5dc459..bbe6f85024a137 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -21,7 +21,12 @@ from ..cache_utils import DynamicCache from ..pytorch_utils import isin_mps_friendly -from .logits_process import LogitsProcessorList, MinLengthLogitsProcessor +from .logits_process import ( + LogitNormalization, + LogitsProcessorList, + MinLengthLogitsProcessor, + SuppressTokensLogitsProcessor, +) if TYPE_CHECKING: @@ -554,6 +559,131 @@ def _process_assistant_outputs( return new_target_ids +class UniversalSpeculativeDecodingGenerator(AssistedCandidateGeneratorDifferentTokenizers): + """ + `CandidateGenerator` class to be used for Universal Speculative Decoding (USD): speculative decoding with different tokenizers + for the assistant and main models. This class generates candidates through the use of a smaller + model. + + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids) + assistant_model (`PreTrainedModel`): + The model to be used for generating candidates. This model should be smaller than the main model. + target_tokenizer (`PreTrainedTokenizerBase`): + The tokenizer used for the target model. + assistant_tokenizer (`PreTrainedTokenizerBase`): + The tokenizer used for the assistant model. + generation_config (`~generation.GenerationConfig`, *optional*): + The generation configuration to be used as base parametrization for the generation call. + logits_processor (`LogitsProcessorList`): + An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] + used to modify the prediction scores of the language modeling head applied at each generation step. + model_kwargs (`Dict`): + The keyword arguments that will be passed to the main model, and are used as base inputs for the assistant + model as well. + inputs_tensor (`torch.Tensor`, *optional*): + The model input tensor. In encoder-decoder models, this is the encoder input. 
+ """ + + def __init__( + self, + input_ids: torch.LongTensor, + assistant_model: "PreTrainedModel", + target_tokenizer: "PreTrainedTokenizerBase", + assistant_tokenizer: "PreTrainedTokenizerBase", + generation_config: "GenerationConfig", + model_kwargs: Dict, + inputs_tensor: Optional[torch.Tensor] = None, + logits_processor: "LogitsProcessorList" = None, + ): + target_vocab: dict[str, int] = target_tokenizer.get_vocab() + assistant_vocab: dict[str, int] = assistant_tokenizer.get_vocab() + self._target_to_assistant_input_ids: dict[int, int] = { + target_vocab[tok]: assistant_vocab[tok] for tok in target_vocab.keys() & assistant_vocab.keys() + } + # Suppress tokens that are in the assistant vocab but not in the target vocab + suppress_input_ids = list( + set(assistant_vocab.values()) - set(self._target_to_assistant_input_ids.values()) + ) + logits_processor.append( + SuppressTokensLogitsProcessor( + suppress_tokens=suppress_input_ids, + device=assistant_model.device, + ), + LogitNormalization(), + ) + super().__init__(input_ids, assistant_model, generation_config, model_kwargs, inputs_tensor, logits_processor) + self.prev_target_seq_len: int = 0 + + def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]: + """ + Fetches the candidates to be tried for the current input. + + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids) + + Return: + `torch.LongTensor` of shape `(batch_size, candidate_length)` containing the candidate sequences to be + assessed by the model and a `torch.FloatTensor` of shape `(batch_size, candidate_length, + vocabulary_size)` containing the logits associated to each candidate. + """ + # input_ids = input_ids.to(self.assistant_model.device) + + def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTensor: + target_seq_len = target_input_ids.shape[-1] + target_new_ids = target_input_ids[ + :, -(target_seq_len - self.prev_target_seq_len) : + ] + return torch.tensor( + [self._target_to_assistant_input_ids[tok.item()] for tok in target_new_ids.flatten()], + device=self.assistant_model.device, + ).view(target_new_ids.shape) + + input_ids = get_assistant_input_ids(input_ids) + + # Don't generate more than `max_length - 1` candidates since the target model generates one extra token. + new_cur_len = input_ids.shape[-1] + max_new_tokens = min(int(self.num_assistant_tokens), self.generation_config.max_length - new_cur_len - 1) + min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - new_cur_len), 0) + if max_new_tokens == 0: + return input_ids, None + + # 1. If it is not the first round of candidate generation, prepare the inputs based on the input_ids length + # (which implicitly contains the number of accepted candidates from the previous round) + has_past_key_values = self.assistant_kwargs.get("past_key_values", None) is not None + if has_past_key_values: + new_cache_size = new_cur_len - 1 + self.assistant_kwargs["past_key_values"] = _crop_past_key_values( + self.assistant_model, self.assistant_kwargs["past_key_values"], new_cache_size - 1 + ) # the assistant does not have the token after the last match, hence the -1 + + self.assistant_kwargs = _prepare_attention_mask( + self.assistant_kwargs, new_cur_len, self.assistant_model.config.is_encoder_decoder + ) + self.assistant_kwargs = _prepare_token_type_ids(self.assistant_kwargs, new_cur_len) + + # 2. 
Forecast next N tokens using the assistant model. + assistant_generation_kwargs = { + self.input_ids_key: input_ids, + "min_new_tokens": min_new_tokens, + "max_new_tokens": max_new_tokens, + "generation_config": self.generation_config, + "logits_processor": self.logits_processor, + } + + assistant_output = self.assistant_model.generate(**assistant_generation_kwargs, **self.assistant_kwargs) + + # 3. Update variables for the next round of candidate generation + self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values + + # 4. Prepare variables for output + candidate_logits = torch.stack(assistant_output.scores, dim=1) + candidate_ids = assistant_output.sequences + return candidate_ids, candidate_logits + + class PromptLookupCandidateGenerator(CandidateGenerator): """ `CandidateGenerator` class to be used for prompt lookup generation. This class generates candidates by looking up From 8a991299aaba67414b1c6a6ef6dcfa2c01a90107 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sat, 16 Nov 2024 14:01:00 -0500 Subject: [PATCH 06/76] Use `UniversalSpeculativeDecodingGenerator` when `generation_config.do_sample=True` --- src/transformers/generation/utils.py | 36 ++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 05e39c4a9b56b5..07c743c6b79f3d 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -57,6 +57,7 @@ CandidateGenerator, EarlyExitCandidateGenerator, PromptLookupCandidateGenerator, + UniversalSpeculativeDecodingGenerator, _crop_past_key_values, _prepare_attention_mask, _prepare_token_type_ids, @@ -846,16 +847,31 @@ def _get_candidate_generator( max_length=generation_config.max_length, ) elif different_tokenizers: - candidate_generator = AssistedCandidateGeneratorDifferentTokenizers( - input_ids=input_ids, - assistant_model=assistant_model, - generation_config=generation_config, - model_kwargs=model_kwargs, - inputs_tensor=inputs_tensor, - logits_processor=logits_processor, - target_tokenizer=target_tokenizer, - assistant_tokenizer=assistant_tokenizer, - ) + match generation_config.do_sample: + case True: + candidate_generator = UniversalSpeculativeDecodingGenerator( + input_ids=input_ids, + assistant_model=assistant_model, + generation_config=generation_config, + model_kwargs=model_kwargs, + inputs_tensor=inputs_tensor, + logits_processor=logits_processor, + target_tokenizer=target_tokenizer, + assistant_tokenizer=assistant_tokenizer, + ) + case False: + candidate_generator = AssistedCandidateGeneratorDifferentTokenizers( + input_ids=input_ids, + assistant_model=assistant_model, + generation_config=generation_config, + model_kwargs=model_kwargs, + inputs_tensor=inputs_tensor, + logits_processor=logits_processor, + target_tokenizer=target_tokenizer, + assistant_tokenizer=assistant_tokenizer, + ) + case _: + raise ValueError(f"Invalid value for `do_sample`: {generation_config.do_sample}") else: candidate_generator = AssistedCandidateGenerator( input_ids=input_ids, From 4649bd2f8406af58c96d845435fea9a8e6a80c17 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sat, 16 Nov 2024 15:08:46 -0500 Subject: [PATCH 07/76] assistant tokenizes only the target's new suffix --- .../generation/candidate_generator.py | 30 ++++++++++++------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 
bbe6f85024a137..d23480a45ac8d1 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -599,12 +599,12 @@ def __init__( ): target_vocab: dict[str, int] = target_tokenizer.get_vocab() assistant_vocab: dict[str, int] = assistant_tokenizer.get_vocab() - self._target_to_assistant_input_ids: dict[int, int] = { - target_vocab[tok]: assistant_vocab[tok] for tok in target_vocab.keys() & assistant_vocab.keys() + self._assistant_to_target_input_ids: dict[int, int] = { + assistant_vocab[tok]: target_vocab[tok] for tok in target_vocab.keys() & assistant_vocab.keys() } # Suppress tokens that are in the assistant vocab but not in the target vocab suppress_input_ids = list( - set(assistant_vocab.values()) - set(self._target_to_assistant_input_ids.values()) + set(assistant_vocab.values()) - set(self._assistant_to_target_input_ids.values()) ) logits_processor.append( SuppressTokensLogitsProcessor( @@ -614,7 +614,8 @@ def __init__( LogitNormalization(), ) super().__init__(input_ids, assistant_model, generation_config, model_kwargs, inputs_tensor, logits_processor) - self.prev_target_seq_len: int = 0 + self._prev_target_seq_len: int = 0 + self._prev_assistant_ids: torch.LongTensor | None = None def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]: """ @@ -629,19 +630,25 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, assessed by the model and a `torch.FloatTensor` of shape `(batch_size, candidate_length, vocabulary_size)` containing the logits associated to each candidate. """ - # input_ids = input_ids.to(self.assistant_model.device) + has_past_key_values = self.assistant_kwargs.get("past_key_values", None) is not None def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTensor: + nonlocal has_past_key_values target_seq_len = target_input_ids.shape[-1] target_new_ids = target_input_ids[ - :, -(target_seq_len - self.prev_target_seq_len) : + :, -(target_seq_len - self._prev_target_seq_len) : ] - return torch.tensor( - [self._target_to_assistant_input_ids[tok.item()] for tok in target_new_ids.flatten()], - device=self.assistant_model.device, - ).view(target_new_ids.shape) + # convert target_new_ids to string, and then, convert the string to assistant_new_ids + target_new_toks = self.target_tokenizer.batch_decode(target_new_ids, skip_special_tokens=False) + assistant_new_ids = self.assistant_tokenizer.encode(target_new_toks, add_special_tokens=False) + if self._prev_assistant_ids is None: + self._prev_assistant_ids = assistant_new_ids + else: + self._prev_assistant_ids = torch.cat([self._prev_assistant_ids, assistant_new_ids], dim=-1) + return self._prev_assistant_ids input_ids = get_assistant_input_ids(input_ids) + input_ids = input_ids.to(self.assistant_model.device) # Don't generate more than `max_length - 1` candidates since the target model generates one extra token. new_cur_len = input_ids.shape[-1] @@ -652,7 +659,7 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen # 1. 
If it is not the first round of candidate generation, prepare the inputs based on the input_ids length # (which implicitly contains the number of accepted candidates from the previous round) - has_past_key_values = self.assistant_kwargs.get("past_key_values", None) is not None + # has_past_key_values = self.assistant_kwargs.get("past_key_values", None) is not None if has_past_key_values: new_cache_size = new_cur_len - 1 self.assistant_kwargs["past_key_values"] = _crop_past_key_values( @@ -681,6 +688,7 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen # 4. Prepare variables for output candidate_logits = torch.stack(assistant_output.scores, dim=1) candidate_ids = assistant_output.sequences + candidate_ids.apply_(lambda x: self._assistant_to_target_input_ids[x.item()]) return candidate_ids, candidate_logits From f199c94ac821cffe81b9ea04955d71792d078825 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sat, 16 Nov 2024 15:24:30 -0500 Subject: [PATCH 08/76] formatting --- src/transformers/generation/candidate_generator.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index d23480a45ac8d1..6cda4df3153f05 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -603,9 +603,7 @@ def __init__( assistant_vocab[tok]: target_vocab[tok] for tok in target_vocab.keys() & assistant_vocab.keys() } # Suppress tokens that are in the assistant vocab but not in the target vocab - suppress_input_ids = list( - set(assistant_vocab.values()) - set(self._assistant_to_target_input_ids.values()) - ) + suppress_input_ids = list(set(assistant_vocab.values()) - set(self._assistant_to_target_input_ids.values())) logits_processor.append( SuppressTokensLogitsProcessor( suppress_tokens=suppress_input_ids, @@ -635,11 +633,10 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTensor: nonlocal has_past_key_values target_seq_len = target_input_ids.shape[-1] - target_new_ids = target_input_ids[ - :, -(target_seq_len - self._prev_target_seq_len) : - ] - # convert target_new_ids to string, and then, convert the string to assistant_new_ids + target_new_ids = target_input_ids[:, -(target_seq_len - self._prev_target_seq_len) :] + # Convert target_new_ids to string target_new_toks = self.target_tokenizer.batch_decode(target_new_ids, skip_special_tokens=False) + # Convert the string to assistant_new_ids assistant_new_ids = self.assistant_tokenizer.encode(target_new_toks, add_special_tokens=False) if self._prev_assistant_ids is None: self._prev_assistant_ids = assistant_new_ids From 19c0057dd56b5385f6f7d0a0379c4173b6e0e7ca Mon Sep 17 00:00:00 2001 From: jmamou Date: Thu, 21 Nov 2024 05:20:50 -0800 Subject: [PATCH 09/76] fix code --- .../generation/candidate_generator.py | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 6cda4df3153f05..16cd7487e4bafe 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -608,10 +608,10 @@ def __init__( SuppressTokensLogitsProcessor( suppress_tokens=suppress_input_ids, device=assistant_model.device, - ), - LogitNormalization(), + ) ) - 
super().__init__(input_ids, assistant_model, generation_config, model_kwargs, inputs_tensor, logits_processor) + logits_processor.append(LogitNormalization()) + super().__init__(input_ids, assistant_model, target_tokenizer, assistant_tokenizer, generation_config, model_kwargs, inputs_tensor, logits_processor) self._prev_target_seq_len: int = 0 self._prev_assistant_ids: torch.LongTensor | None = None @@ -637,7 +637,7 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen # Convert target_new_ids to string target_new_toks = self.target_tokenizer.batch_decode(target_new_ids, skip_special_tokens=False) # Convert the string to assistant_new_ids - assistant_new_ids = self.assistant_tokenizer.encode(target_new_toks, add_special_tokens=False) + assistant_new_ids = self.assistant_tokenizer.encode(target_new_toks[0], add_special_tokens=False) if self._prev_assistant_ids is None: self._prev_assistant_ids = assistant_new_ids else: @@ -645,6 +645,9 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen return self._prev_assistant_ids input_ids = get_assistant_input_ids(input_ids) + # Ensure input_ids is a 2D tensor + if isinstance(input_ids, list): + input_ids = torch.tensor(input_ids).unsqueeze(0) input_ids = input_ids.to(self.assistant_model.device) # Don't generate more than `max_length - 1` candidates since the target model generates one extra token. @@ -662,12 +665,13 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen self.assistant_kwargs["past_key_values"] = _crop_past_key_values( self.assistant_model, self.assistant_kwargs["past_key_values"], new_cache_size - 1 ) # the assistant does not have the token after the last match, hence the -1 - - self.assistant_kwargs = _prepare_attention_mask( - self.assistant_kwargs, new_cur_len, self.assistant_model.config.is_encoder_decoder - ) self.assistant_kwargs = _prepare_token_type_ids(self.assistant_kwargs, new_cur_len) + # we need to update the attention mask to reflect the new input_ids length + self.assistant_kwargs = _prepare_attention_mask( + self.assistant_kwargs, new_cur_len, self.assistant_model.config.is_encoder_decoder + ) + # 2. Forecast next N tokens using the assistant model. assistant_generation_kwargs = { self.input_ids_key: input_ids, @@ -685,7 +689,10 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen # 4. 
Prepare variables for output candidate_logits = torch.stack(assistant_output.scores, dim=1) candidate_ids = assistant_output.sequences - candidate_ids.apply_(lambda x: self._assistant_to_target_input_ids[x.item()]) + device = candidate_ids.device + candidate_ids = candidate_ids.cpu() + candidate_ids.apply_(lambda x: self._assistant_to_target_input_ids[x]) + candidate_ids = candidate_ids.to(device) return candidate_ids, candidate_logits From acf5a4b315c18a01c561f3a94cc49ae768d5265c Mon Sep 17 00:00:00 2001 From: jmamou Date: Sun, 24 Nov 2024 05:15:37 -0800 Subject: [PATCH 10/76] fix code --- .../generation/candidate_generator.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 16cd7487e4bafe..4d8aa41f1fcd29 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -603,7 +603,7 @@ def __init__( assistant_vocab[tok]: target_vocab[tok] for tok in target_vocab.keys() & assistant_vocab.keys() } # Suppress tokens that are in the assistant vocab but not in the target vocab - suppress_input_ids = list(set(assistant_vocab.values()) - set(self._assistant_to_target_input_ids.values())) + suppress_input_ids = list(set(assistant_vocab.values()) - set(self._assistant_to_target_input_ids.keys())) logits_processor.append( SuppressTokensLogitsProcessor( suppress_tokens=suppress_input_ids, @@ -634,6 +634,7 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen nonlocal has_past_key_values target_seq_len = target_input_ids.shape[-1] target_new_ids = target_input_ids[:, -(target_seq_len - self._prev_target_seq_len) :] + self._prev_target_seq_len = target_seq_len # Convert target_new_ids to string target_new_toks = self.target_tokenizer.batch_decode(target_new_ids, skip_special_tokens=False) # Convert the string to assistant_new_ids @@ -641,20 +642,17 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen if self._prev_assistant_ids is None: self._prev_assistant_ids = assistant_new_ids else: - self._prev_assistant_ids = torch.cat([self._prev_assistant_ids, assistant_new_ids], dim=-1) - return self._prev_assistant_ids - + self._prev_assistant_ids = self._prev_assistant_ids + assistant_new_ids + return torch.tensor(self._prev_assistant_ids).unsqueeze(0).to(self.assistant_model.device) + + target_input_ids = input_ids.clone() input_ids = get_assistant_input_ids(input_ids) - # Ensure input_ids is a 2D tensor - if isinstance(input_ids, list): - input_ids = torch.tensor(input_ids).unsqueeze(0) - input_ids = input_ids.to(self.assistant_model.device) # Don't generate more than `max_length - 1` candidates since the target model generates one extra token. new_cur_len = input_ids.shape[-1] max_new_tokens = min(int(self.num_assistant_tokens), self.generation_config.max_length - new_cur_len - 1) min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - new_cur_len), 0) - if max_new_tokens == 0: + if max_new_tokens <= 0: return input_ids, None # 1. 
If it is not the first round of candidate generation, prepare the inputs based on the input_ids length @@ -691,8 +689,10 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen candidate_ids = assistant_output.sequences device = candidate_ids.device candidate_ids = candidate_ids.cpu() - candidate_ids.apply_(lambda x: self._assistant_to_target_input_ids[x]) + candidate_ids = candidate_ids[0, -(len(candidate_ids[0])-input_ids.shape[1]):].apply_(lambda x: self._assistant_to_target_input_ids[x]) candidate_ids = candidate_ids.to(device) + candidate_ids = torch.cat((target_input_ids, candidate_ids.unsqueeze(0)), dim=1) + return candidate_ids, candidate_logits From 3712117297f9c73def57285748b6376aeccfbff3 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sun, 24 Nov 2024 11:28:46 -0500 Subject: [PATCH 11/76] formatting --- .../generation/candidate_generator.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 4d8aa41f1fcd29..da7a77adc97d6c 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -611,7 +611,16 @@ def __init__( ) ) logits_processor.append(LogitNormalization()) - super().__init__(input_ids, assistant_model, target_tokenizer, assistant_tokenizer, generation_config, model_kwargs, inputs_tensor, logits_processor) + super().__init__( + input_ids, + assistant_model, + target_tokenizer, + assistant_tokenizer, + generation_config, + model_kwargs, + inputs_tensor, + logits_processor, + ) self._prev_target_seq_len: int = 0 self._prev_assistant_ids: torch.LongTensor | None = None @@ -644,7 +653,7 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen else: self._prev_assistant_ids = self._prev_assistant_ids + assistant_new_ids return torch.tensor(self._prev_assistant_ids).unsqueeze(0).to(self.assistant_model.device) - + target_input_ids = input_ids.clone() input_ids = get_assistant_input_ids(input_ids) @@ -689,7 +698,9 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen candidate_ids = assistant_output.sequences device = candidate_ids.device candidate_ids = candidate_ids.cpu() - candidate_ids = candidate_ids[0, -(len(candidate_ids[0])-input_ids.shape[1]):].apply_(lambda x: self._assistant_to_target_input_ids[x]) + candidate_ids = candidate_ids[0, -(len(candidate_ids[0]) - input_ids.shape[1]) :].apply_( + lambda x: self._assistant_to_target_input_ids[x] + ) candidate_ids = candidate_ids.to(device) candidate_ids = torch.cat((target_input_ids, candidate_ids.unsqueeze(0)), dim=1) From 63f2f4622b418eda71cf0aad682bfa3ec9838a51 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sun, 24 Nov 2024 14:20:56 -0500 Subject: [PATCH 12/76] add `TestGenerateWithDifferentModels` --- tests/generation/test_candidate_generator.py | 63 ++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/tests/generation/test_candidate_generator.py b/tests/generation/test_candidate_generator.py index 03fd51324b022f..38840431816532 100644 --- a/tests/generation/test_candidate_generator.py +++ b/tests/generation/test_candidate_generator.py @@ -2,7 +2,9 @@ import numpy as np +from transformers import AutoModelForCausalLM, AutoTokenizer from transformers.generation.candidate_generator import AssistedCandidateGeneratorDifferentTokenizers +from transformers.generation.utils import GenerationConfig class 
TestAssistedCandidateGeneratorDifferentTokenizers(unittest.TestCase): @@ -41,3 +43,64 @@ def test_no_new_tokens(self): self.assertEqual(discrep_length, 0) np.testing.assert_array_equal(new_tokens_only, np.array([[]])) np.testing.assert_array_equal(discrep_only, np.array([[]])) + + +class TestGenerateWithDifferentModels(unittest.TestCase): + """Tests generation with different target and assistant models.""" + + def test_generate_with_different_models(self): + # Use smaller test models instead + target_model_checkpoint = "hf-internal-testing/tiny-random-LlamaForCausalLM" + assistant_checkpoint = "hf-internal-testing/tiny-random-gpt2" + + prompt = "Alice and Bob" + + # Load models sequentially and handle cleanup + target_model = AutoModelForCausalLM.from_pretrained( + target_model_checkpoint, + ) + target_tokenizer = AutoTokenizer.from_pretrained(target_model_checkpoint) + + assistant_model = AutoModelForCausalLM.from_pretrained( + assistant_checkpoint, + ) + assistant_tokenizer = AutoTokenizer.from_pretrained(assistant_checkpoint) + + # Tokenize input + input_ids = target_tokenizer(prompt, return_tensors="pt").input_ids.to(target_model.device) + + # Create generation configs + do_sample = False + base_config = GenerationConfig( + max_new_tokens=20, + do_sample=do_sample, + ) + + # Generate with and without assistant model + outputs_normal = target_model.generate( + input_ids, + generation_config=base_config, + ) + + # Pass the assistant model and tokenizers directly to the generate method + outputs_assisted = target_model.generate( + input_ids, + generation_config=base_config, + assistant_model=assistant_model, + tokenizer=target_tokenizer, + assistant_tokenizer=assistant_tokenizer, + ) + + # Decode outputs + text_normal = target_tokenizer.batch_decode(outputs_normal, skip_special_tokens=True)[0] + text_assisted = target_tokenizer.batch_decode(outputs_assisted, skip_special_tokens=True)[0] + + # Basic validation + self.assertIsInstance(text_normal, str) + self.assertIsInstance(text_assisted, str) + self.assertGreater(len(text_normal), len(prompt)) + self.assertGreater(len(text_assisted), len(prompt)) + self.assertTrue(text_normal.startswith(prompt)) + self.assertTrue(text_assisted.startswith(prompt)) + if not do_sample: + self.assertEqual(text_normal, text_assisted) From 6ac33f147121d7d2b5e51490369b06ba93443b0b Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sun, 24 Nov 2024 14:24:34 -0500 Subject: [PATCH 13/76] `TestGenerateWithDifferentModels` parameterize on `do_sample` --- tests/generation/test_candidate_generator.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/generation/test_candidate_generator.py b/tests/generation/test_candidate_generator.py index 38840431816532..6ef031edc922fb 100644 --- a/tests/generation/test_candidate_generator.py +++ b/tests/generation/test_candidate_generator.py @@ -1,4 +1,5 @@ import unittest +from parameterized import parameterized import numpy as np @@ -48,7 +49,11 @@ def test_no_new_tokens(self): class TestGenerateWithDifferentModels(unittest.TestCase): """Tests generation with different target and assistant models.""" - def test_generate_with_different_models(self): + @parameterized.expand([ + (False,), + (True,), + ]) + def test_generate_with_different_models(self, do_sample): # Use smaller test models instead target_model_checkpoint = "hf-internal-testing/tiny-random-LlamaForCausalLM" assistant_checkpoint = "hf-internal-testing/tiny-random-gpt2" @@ -70,7 +75,6 @@ def test_generate_with_different_models(self): input_ids 
= target_tokenizer(prompt, return_tensors="pt").input_ids.to(target_model.device) # Create generation configs - do_sample = False base_config = GenerationConfig( max_new_tokens=20, do_sample=do_sample, From 69383117d165687c6e1394945a4c4a201baaa2dc Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sun, 24 Nov 2024 16:06:16 -0500 Subject: [PATCH 14/76] `AssistantVocabMapping` & `AssistantVocabMappingCache` --- .../generation/candidate_generator.py | 71 +++++++++++++++---- 1 file changed, 59 insertions(+), 12 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index da7a77adc97d6c..ea1f7fa23e8632 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -14,7 +14,10 @@ # limitations under the License. import copy +from functools import lru_cache +import threading from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple +import weakref import numpy as np import torch @@ -559,6 +562,52 @@ def _process_assistant_outputs( return new_target_ids +class AssistantVocabMapping: + def __init__(self, target_tokenizer, assistant_tokenizer): + self.target_tokenizer = target_tokenizer + self.assistant_tokenizer = assistant_tokenizer + self.assistant_to_target_input_ids = self._get_assistant_to_target_input_ids() + self.suppress_input_ids = self._get_suppress_input_ids() + + def _get_assistant_to_target_input_ids(self) -> dict[int, int]: + """ + Get a mapping from assistant tokens to target tokens based on vocabularies. + """ + target_vocab = self.target_tokenizer.get_vocab() + assistant_vocab = self.assistant_tokenizer.get_vocab() + return { + assistant_vocab[tok]: target_vocab[tok] + for tok in set(target_vocab.keys()) & set(assistant_vocab.keys()) + } + + def _get_suppress_input_ids(self) -> list[int]: + """ + Get the input ids that are in the assistant vocab but not in the target vocab. 
+ """ + assistant_vocab = self.assistant_tokenizer.get_vocab() + return list(set(assistant_vocab.values()) - set(self.assistant_to_target_input_ids.keys())) + + +class AssistantVocabMappingCache: + _lock = threading.Lock() + _cache = weakref.WeakKeyDictionary() + + @classmethod + def get_mapping(cls, target_tokenizer, assistant_tokenizer) -> AssistantVocabMapping: + with cls._lock: + assistant_dict = cls._cache.get(target_tokenizer) + if assistant_dict is None: + assistant_dict = weakref.WeakKeyDictionary() + cls._cache[target_tokenizer] = assistant_dict + + mapping = assistant_dict.get(assistant_tokenizer) + if mapping is None: + mapping = AssistantVocabMapping(target_tokenizer, assistant_tokenizer) + assistant_dict[assistant_tokenizer] = mapping + + return mapping + + class UniversalSpeculativeDecodingGenerator(AssistedCandidateGeneratorDifferentTokenizers): """ `CandidateGenerator` class to be used for Universal Speculative Decoding (USD): speculative decoding with different tokenizers @@ -597,20 +646,18 @@ def __init__( inputs_tensor: Optional[torch.Tensor] = None, logits_processor: "LogitsProcessorList" = None, ): - target_vocab: dict[str, int] = target_tokenizer.get_vocab() - assistant_vocab: dict[str, int] = assistant_tokenizer.get_vocab() - self._assistant_to_target_input_ids: dict[int, int] = { - assistant_vocab[tok]: target_vocab[tok] for tok in target_vocab.keys() & assistant_vocab.keys() - } - # Suppress tokens that are in the assistant vocab but not in the target vocab - suppress_input_ids = list(set(assistant_vocab.values()) - set(self._assistant_to_target_input_ids.keys())) - logits_processor.append( + assistant_vocab_mapping = AssistantVocabMappingCache.get_mapping( + target_tokenizer, assistant_tokenizer + ) + self._assistant_to_target_input_ids = assistant_vocab_mapping.assistant_to_target_input_ids + logits_processor += [ SuppressTokensLogitsProcessor( - suppress_tokens=suppress_input_ids, + suppress_tokens=assistant_vocab_mapping.suppress_input_ids, device=assistant_model.device, - ) - ) - logits_processor.append(LogitNormalization()) + ), + LogitNormalization(), + ] + super().__init__( input_ids, assistant_model, From 5a0db3bb7322718171354ac9d87871b51a7624a4 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sun, 24 Nov 2024 16:06:35 -0500 Subject: [PATCH 15/76] formatting --- src/transformers/generation/candidate_generator.py | 12 ++++-------- tests/generation/test_candidate_generator.py | 12 +++++++----- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index ea1f7fa23e8632..e35c9086adcf18 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -14,10 +14,9 @@ # limitations under the License. 
import copy -from functools import lru_cache import threading -from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple import weakref +from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple import numpy as np import torch @@ -576,8 +575,7 @@ def _get_assistant_to_target_input_ids(self) -> dict[int, int]: target_vocab = self.target_tokenizer.get_vocab() assistant_vocab = self.assistant_tokenizer.get_vocab() return { - assistant_vocab[tok]: target_vocab[tok] - for tok in set(target_vocab.keys()) & set(assistant_vocab.keys()) + assistant_vocab[tok]: target_vocab[tok] for tok in set(target_vocab.keys()) & set(assistant_vocab.keys()) } def _get_suppress_input_ids(self) -> list[int]: @@ -606,7 +604,7 @@ def get_mapping(cls, target_tokenizer, assistant_tokenizer) -> AssistantVocabMap assistant_dict[assistant_tokenizer] = mapping return mapping - + class UniversalSpeculativeDecodingGenerator(AssistedCandidateGeneratorDifferentTokenizers): """ @@ -646,9 +644,7 @@ def __init__( inputs_tensor: Optional[torch.Tensor] = None, logits_processor: "LogitsProcessorList" = None, ): - assistant_vocab_mapping = AssistantVocabMappingCache.get_mapping( - target_tokenizer, assistant_tokenizer - ) + assistant_vocab_mapping = AssistantVocabMappingCache.get_mapping(target_tokenizer, assistant_tokenizer) self._assistant_to_target_input_ids = assistant_vocab_mapping.assistant_to_target_input_ids logits_processor += [ SuppressTokensLogitsProcessor( diff --git a/tests/generation/test_candidate_generator.py b/tests/generation/test_candidate_generator.py index 6ef031edc922fb..f544a6cb2d838b 100644 --- a/tests/generation/test_candidate_generator.py +++ b/tests/generation/test_candidate_generator.py @@ -1,7 +1,7 @@ import unittest -from parameterized import parameterized import numpy as np +from parameterized import parameterized from transformers import AutoModelForCausalLM, AutoTokenizer from transformers.generation.candidate_generator import AssistedCandidateGeneratorDifferentTokenizers @@ -49,10 +49,12 @@ def test_no_new_tokens(self): class TestGenerateWithDifferentModels(unittest.TestCase): """Tests generation with different target and assistant models.""" - @parameterized.expand([ - (False,), - (True,), - ]) + @parameterized.expand( + [ + (False,), + (True,), + ] + ) def test_generate_with_different_models(self, do_sample): # Use smaller test models instead target_model_checkpoint = "hf-internal-testing/tiny-random-LlamaForCausalLM" From 92f8ad3b55b65fffea2c85d2674be2e8486430d6 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sun, 24 Nov 2024 16:37:19 -0500 Subject: [PATCH 16/76] `AssistantToTargetTranslator`: `get_target_input_ids` & `get_target_logits` --- .../generation/candidate_generator.py | 54 +++++++++++-------- 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index e35c9086adcf18..3576f3adee0eea 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -561,19 +561,19 @@ def _process_assistant_outputs( return new_target_ids -class AssistantVocabMapping: +class AssistantToTargetTranslator: def __init__(self, target_tokenizer, assistant_tokenizer): - self.target_tokenizer = target_tokenizer - self.assistant_tokenizer = assistant_tokenizer - self.assistant_to_target_input_ids = self._get_assistant_to_target_input_ids() + self._target_tokenizer = target_tokenizer + self._assistant_tokenizer = assistant_tokenizer + 
self._assistant_to_target_input_ids = self._get_assistant_to_target_input_ids() self.suppress_input_ids = self._get_suppress_input_ids() def _get_assistant_to_target_input_ids(self) -> dict[int, int]: """ Get a mapping from assistant tokens to target tokens based on vocabularies. """ - target_vocab = self.target_tokenizer.get_vocab() - assistant_vocab = self.assistant_tokenizer.get_vocab() + target_vocab = self._target_tokenizer.get_vocab() + assistant_vocab = self._assistant_tokenizer.get_vocab() return { assistant_vocab[tok]: target_vocab[tok] for tok in set(target_vocab.keys()) & set(assistant_vocab.keys()) } @@ -582,8 +582,26 @@ def _get_suppress_input_ids(self) -> list[int]: """ Get the input ids that are in the assistant vocab but not in the target vocab. """ - assistant_vocab = self.assistant_tokenizer.get_vocab() - return list(set(assistant_vocab.values()) - set(self.assistant_to_target_input_ids.keys())) + assistant_vocab = self._assistant_tokenizer.get_vocab() + return list(set(assistant_vocab.values()) - set(self._assistant_to_target_input_ids.keys())) + + def get_target_input_ids(self, assistant_input_ids: torch.LongTensor) -> torch.LongTensor: + """ + Return the target input ids that correspond to the assistant input ids. + """ + return assistant_input_ids.apply_(lambda x: self._assistant_to_target_input_ids.get(x, x)) + + def get_target_logits(self, assistant_logits: torch.FloatTensor) -> torch.FloatTensor: + """ + Return the target logits that correspond to the assistant logits. + """ + target_vocab_size = len(self._target_tokenizer.get_vocab()) + ret: torch.FloatTensor = torch.zeros( + (assistant_logits.shape[0], target_vocab_size), device=assistant_logits.device + ) + ret[:, self.suppress_input_ids] = -float("inf") + ret[:, list(self._assistant_to_target_input_ids.values())] = assistant_logits + return ret class AssistantVocabMappingCache: @@ -591,7 +609,7 @@ class AssistantVocabMappingCache: _cache = weakref.WeakKeyDictionary() @classmethod - def get_mapping(cls, target_tokenizer, assistant_tokenizer) -> AssistantVocabMapping: + def get_mapping(cls, target_tokenizer, assistant_tokenizer) -> AssistantToTargetTranslator: with cls._lock: assistant_dict = cls._cache.get(target_tokenizer) if assistant_dict is None: @@ -600,7 +618,7 @@ def get_mapping(cls, target_tokenizer, assistant_tokenizer) -> AssistantVocabMap mapping = assistant_dict.get(assistant_tokenizer) if mapping is None: - mapping = AssistantVocabMapping(target_tokenizer, assistant_tokenizer) + mapping = AssistantToTargetTranslator(target_tokenizer, assistant_tokenizer) assistant_dict[assistant_tokenizer] = mapping return mapping @@ -644,11 +662,10 @@ def __init__( inputs_tensor: Optional[torch.Tensor] = None, logits_processor: "LogitsProcessorList" = None, ): - assistant_vocab_mapping = AssistantVocabMappingCache.get_mapping(target_tokenizer, assistant_tokenizer) - self._assistant_to_target_input_ids = assistant_vocab_mapping.assistant_to_target_input_ids + self._assistant_vocab_mapping = AssistantVocabMappingCache.get_mapping(target_tokenizer, assistant_tokenizer) logits_processor += [ SuppressTokensLogitsProcessor( - suppress_tokens=assistant_vocab_mapping.suppress_input_ids, + suppress_tokens=self._assistant_vocab_mapping.suppress_input_ids, device=assistant_model.device, ), LogitNormalization(), @@ -697,7 +714,6 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen self._prev_assistant_ids = self._prev_assistant_ids + assistant_new_ids return 
torch.tensor(self._prev_assistant_ids).unsqueeze(0).to(self.assistant_model.device) - target_input_ids = input_ids.clone() input_ids = get_assistant_input_ids(input_ids) # Don't generate more than `max_length - 1` candidates since the target model generates one extra token. @@ -739,13 +755,9 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen # 4. Prepare variables for output candidate_logits = torch.stack(assistant_output.scores, dim=1) candidate_ids = assistant_output.sequences - device = candidate_ids.device - candidate_ids = candidate_ids.cpu() - candidate_ids = candidate_ids[0, -(len(candidate_ids[0]) - input_ids.shape[1]) :].apply_( - lambda x: self._assistant_to_target_input_ids[x] - ) - candidate_ids = candidate_ids.to(device) - candidate_ids = torch.cat((target_input_ids, candidate_ids.unsqueeze(0)), dim=1) + candidate_ids = self._assistant_vocab_mapping.get_target_input_ids(candidate_ids) + + candidate_logits = self._assistant_vocab_mapping.get_target_logits(candidate_logits) return candidate_ids, candidate_logits From 7c8708ed3fd612d747a3ebaf42b2a4fe81ec2f07 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sun, 24 Nov 2024 18:11:57 -0500 Subject: [PATCH 17/76] improve `_get_assistant_to_target_input_ids` & formatting --- .../generation/candidate_generator.py | 42 ++++++++++++------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 3576f3adee0eea..2914b472d96f26 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -562,11 +562,15 @@ def _process_assistant_outputs( class AssistantToTargetTranslator: - def __init__(self, target_tokenizer, assistant_tokenizer): - self._target_tokenizer = target_tokenizer - self._assistant_tokenizer = assistant_tokenizer - self._assistant_to_target_input_ids = self._get_assistant_to_target_input_ids() - self.suppress_input_ids = self._get_suppress_input_ids() + """ + Translate the assistant into the target universe. + """ + + def __init__(self, target_tokenizer: "PreTrainedTokenizerBase", assistant_tokenizer: "PreTrainedTokenizerBase"): + self._target_tokenizer: "PreTrainedTokenizerBase" = target_tokenizer + self._assistant_tokenizer: "PreTrainedTokenizerBase" = assistant_tokenizer + self._assistant_to_target_input_ids: dict[int, int] = self._get_assistant_to_target_input_ids() + self.suppress_input_ids: list[int] = self._get_suppress_input_ids() def _get_assistant_to_target_input_ids(self) -> dict[int, int]: """ @@ -595,13 +599,18 @@ def get_target_logits(self, assistant_logits: torch.FloatTensor) -> torch.FloatT """ Return the target logits that correspond to the assistant logits. """ - target_vocab_size = len(self._target_tokenizer.get_vocab()) - ret: torch.FloatTensor = torch.zeros( - (assistant_logits.shape[0], target_vocab_size), device=assistant_logits.device + target_vocab_size: int = len(self._target_tokenizer.get_vocab()) + target_shape: tuple[int, ...] 
= (*assistant_logits.shape[:-1], target_vocab_size) + target_logits: torch.FloatTensor = torch.full(target_shape, -float("inf")) + assistant_logits_supported_mask: torch.BoolTensor = assistant_logits > -float("inf") + assistant_logits_supported_indices: torch.IntTensor = assistant_logits_supported_mask.nonzero(as_tuple=True)[ + -1 + ] + target_logits_supported_indices: torch.IntTensor = assistant_logits_supported_indices.apply_( + lambda x: self._assistant_to_target_input_ids[x] ) - ret[:, self.suppress_input_ids] = -float("inf") - ret[:, list(self._assistant_to_target_input_ids.values())] = assistant_logits - return ret + target_logits[..., target_logits_supported_indices] = assistant_logits[..., assistant_logits_supported_mask] + return target_logits class AssistantVocabMappingCache: @@ -609,7 +618,9 @@ class AssistantVocabMappingCache: _cache = weakref.WeakKeyDictionary() @classmethod - def get_mapping(cls, target_tokenizer, assistant_tokenizer) -> AssistantToTargetTranslator: + def get_mapping( + cls, target_tokenizer: "PreTrainedTokenizerBase", assistant_tokenizer: "PreTrainedTokenizerBase" + ) -> AssistantToTargetTranslator: with cls._lock: assistant_dict = cls._cache.get(target_tokenizer) if assistant_dict is None: @@ -755,11 +766,10 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen # 4. Prepare variables for output candidate_logits = torch.stack(assistant_output.scores, dim=1) candidate_ids = assistant_output.sequences - candidate_ids = self._assistant_vocab_mapping.get_target_input_ids(candidate_ids) - - candidate_logits = self._assistant_vocab_mapping.get_target_logits(candidate_logits) + target_ids = self._assistant_vocab_mapping.get_target_input_ids(candidate_ids) - return candidate_ids, candidate_logits + target_logits = self._assistant_vocab_mapping.get_target_logits(candidate_logits) + return target_ids, target_logits class PromptLookupCandidateGenerator(CandidateGenerator): From 880d0aeaaec0c2d4857edb42c369f1707d2725d9 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sun, 24 Nov 2024 18:19:52 -0500 Subject: [PATCH 18/76] renaming --- src/transformers/generation/candidate_generator.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 2914b472d96f26..026f338960b5a8 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -613,12 +613,12 @@ def get_target_logits(self, assistant_logits: torch.FloatTensor) -> torch.FloatT return target_logits -class AssistantVocabMappingCache: +class AssistantVocabTranslatorCache: _lock = threading.Lock() _cache = weakref.WeakKeyDictionary() @classmethod - def get_mapping( + def get_translator( cls, target_tokenizer: "PreTrainedTokenizerBase", assistant_tokenizer: "PreTrainedTokenizerBase" ) -> AssistantToTargetTranslator: with cls._lock: @@ -673,10 +673,10 @@ def __init__( inputs_tensor: Optional[torch.Tensor] = None, logits_processor: "LogitsProcessorList" = None, ): - self._assistant_vocab_mapping = AssistantVocabMappingCache.get_mapping(target_tokenizer, assistant_tokenizer) + self._atm_translator = AssistantVocabTranslatorCache.get_translator(target_tokenizer, assistant_tokenizer) logits_processor += [ SuppressTokensLogitsProcessor( - suppress_tokens=self._assistant_vocab_mapping.suppress_input_ids, + suppress_tokens=self._atm_translator.suppress_input_ids, device=assistant_model.device, ), 
LogitNormalization(), @@ -766,9 +766,9 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen # 4. Prepare variables for output candidate_logits = torch.stack(assistant_output.scores, dim=1) candidate_ids = assistant_output.sequences - target_ids = self._assistant_vocab_mapping.get_target_input_ids(candidate_ids) + target_ids = self._atm_translator.get_target_input_ids(candidate_ids) - target_logits = self._assistant_vocab_mapping.get_target_logits(candidate_logits) + target_logits = self._atm_translator.get_target_logits(candidate_logits) return target_ids, target_logits From d9b5e748a803f90950cb168119193a1d1599e26d Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sun, 24 Nov 2024 19:05:36 -0500 Subject: [PATCH 19/76] WIP: debugging `min_new_tokens` --- .../generation/candidate_generator.py | 31 +++++++++++-------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 026f338960b5a8..710ef04eb75a68 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -571,6 +571,12 @@ def __init__(self, target_tokenizer: "PreTrainedTokenizerBase", assistant_tokeni self._assistant_tokenizer: "PreTrainedTokenizerBase" = assistant_tokenizer self._assistant_to_target_input_ids: dict[int, int] = self._get_assistant_to_target_input_ids() self.suppress_input_ids: list[int] = self._get_suppress_input_ids() + self.logits_processors: LogitsProcessorList = LogitsProcessorList( + [ + SuppressTokensLogitsProcessor(self.suppress_input_ids), + LogitNormalization(), + ] + ) def _get_assistant_to_target_input_ids(self) -> dict[int, int]: """ @@ -674,14 +680,6 @@ def __init__( logits_processor: "LogitsProcessorList" = None, ): self._atm_translator = AssistantVocabTranslatorCache.get_translator(target_tokenizer, assistant_tokenizer) - logits_processor += [ - SuppressTokensLogitsProcessor( - suppress_tokens=self._atm_translator.suppress_input_ids, - device=assistant_model.device, - ), - LogitNormalization(), - ] - super().__init__( input_ids, assistant_model, @@ -731,7 +729,7 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen new_cur_len = input_ids.shape[-1] max_new_tokens = min(int(self.num_assistant_tokens), self.generation_config.max_length - new_cur_len - 1) min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - new_cur_len), 0) - if max_new_tokens <= 0: + if max_new_tokens == 0: return input_ids, None # 1. If it is not the first round of candidate generation, prepare the inputs based on the input_ids length @@ -752,20 +750,27 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen # 2. Forecast next N tokens using the assistant model. assistant_generation_kwargs = { self.input_ids_key: input_ids, - "min_new_tokens": min_new_tokens, - "max_new_tokens": max_new_tokens, + # "min_new_tokens": min_new_tokens, + # "max_new_tokens": max_new_tokens, + "min_new_tokens": 100, + "max_new_tokens": 100, "generation_config": self.generation_config, "logits_processor": self.logits_processor, } - assistant_output = self.assistant_model.generate(**assistant_generation_kwargs, **self.assistant_kwargs) + assistant_output = self.assistant_model.generate(**assistant_generation_kwargs, **self.assistant_kwargs, output_logits=True) # 3. 
Update variables for the next round of candidate generation self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values # 4. Prepare variables for output - candidate_logits = torch.stack(assistant_output.scores, dim=1) + # candidate_logits = torch.stack(assistant_output.scores, dim=1) + candidate_logits = torch.stack(assistant_output.logits, dim=1) + if not candidate_logits.shape[1] > 1: + msg = f"Since we set min_new_tokens to {assistant_generation_kwargs['min_new_tokens']} and max_new_tokens to {assistant_generation_kwargs['max_new_tokens']}, we expect at least 2 candidates, but seems like we got {candidate_logits.shape[1]} candidates." + raise Exception(msg) candidate_ids = assistant_output.sequences + candidate_logits = self._atm_translator.logits_processors(input_ids=candidate_ids, scores=candidate_logits) target_ids = self._atm_translator.get_target_input_ids(candidate_ids) target_logits = self._atm_translator.get_target_logits(candidate_logits) From 25974d5e998260017b7f3846bf35644bc9551909 Mon Sep 17 00:00:00 2001 From: jmamou Date: Mon, 25 Nov 2024 04:03:41 -0800 Subject: [PATCH 20/76] fix get_target_ids --- .../generation/candidate_generator.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 710ef04eb75a68..e2dcba1e2cb076 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -595,11 +595,16 @@ def _get_suppress_input_ids(self) -> list[int]: assistant_vocab = self._assistant_tokenizer.get_vocab() return list(set(assistant_vocab.values()) - set(self._assistant_to_target_input_ids.keys())) - def get_target_input_ids(self, assistant_input_ids: torch.LongTensor) -> torch.LongTensor: + def get_target_ids(self, assistant_input_ids, target_input_ids, assistant_candidate_ids: torch.LongTensor) -> torch.LongTensor: """ - Return the target input ids that correspond to the assistant input ids. + Return the target candidate ids that correspond to the assistant candidate ids. + Note that we have already the target ids for the prompt and we only need to find the target ids for the new tokens. + Moreover, assistant ids of the original prompt does not necessarily appear in _assistant_to_target_input_ids. """ - return assistant_input_ids.apply_(lambda x: self._assistant_to_target_input_ids.get(x, x)) + target_candidate_ids = assistant_candidate_ids[0, -(len(assistant_candidate_ids[0]) - assistant_input_ids.shape[1]) :].apply_( + lambda x: self._assistant_to_target_input_ids.get(x, x) + ) + return torch.cat((target_input_ids, target_candidate_ids.unsqueeze(0)), dim=1) def get_target_logits(self, assistant_logits: torch.FloatTensor) -> torch.FloatTensor: """ @@ -723,6 +728,7 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen self._prev_assistant_ids = self._prev_assistant_ids + assistant_new_ids return torch.tensor(self._prev_assistant_ids).unsqueeze(0).to(self.assistant_model.device) + target_input_ids = input_ids.clone() input_ids = get_assistant_input_ids(input_ids) # Don't generate more than `max_length - 1` candidates since the target model generates one extra token. @@ -764,14 +770,13 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values # 4. 
Prepare variables for output - # candidate_logits = torch.stack(assistant_output.scores, dim=1) candidate_logits = torch.stack(assistant_output.logits, dim=1) if not candidate_logits.shape[1] > 1: msg = f"Since we set min_new_tokens to {assistant_generation_kwargs['min_new_tokens']} and max_new_tokens to {assistant_generation_kwargs['max_new_tokens']}, we expect at least 2 candidates, but seems like we got {candidate_logits.shape[1]} candidates." raise Exception(msg) candidate_ids = assistant_output.sequences candidate_logits = self._atm_translator.logits_processors(input_ids=candidate_ids, scores=candidate_logits) - target_ids = self._atm_translator.get_target_input_ids(candidate_ids) + target_ids = self._atm_translator.get_target_ids(input_ids, target_input_ids, candidate_ids) target_logits = self._atm_translator.get_target_logits(candidate_logits) return target_ids, target_logits From b8636ab1d811ccbb950ab548a68c7a53dfcc192f Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sat, 16 Nov 2024 13:54:03 -0500 Subject: [PATCH 21/76] `UniversalSpeculativeDecodingGenerator` --- .../generation/candidate_generator.py | 125 ++++++++++++++++++ 1 file changed, 125 insertions(+) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index e2dcba1e2cb076..d94e61d0a48f42 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -782,6 +782,131 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen return target_ids, target_logits +class UniversalSpeculativeDecodingGenerator(AssistedCandidateGeneratorDifferentTokenizers): + """ + `CandidateGenerator` class to be used for Universal Speculative Decoding (USD): speculative decoding with different tokenizers + for the assistant and main models. This class generates candidates through the use of a smaller + model. + + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids) + assistant_model (`PreTrainedModel`): + The model to be used for generating candidates. This model should be smaller than the main model. + target_tokenizer (`PreTrainedTokenizerBase`): + The tokenizer used for the target model. + assistant_tokenizer (`PreTrainedTokenizerBase`): + The tokenizer used for the assistant model. + generation_config (`~generation.GenerationConfig`, *optional*): + The generation configuration to be used as base parametrization for the generation call. + logits_processor (`LogitsProcessorList`): + An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] + used to modify the prediction scores of the language modeling head applied at each generation step. + model_kwargs (`Dict`): + The keyword arguments that will be passed to the main model, and are used as base inputs for the assistant + model as well. + inputs_tensor (`torch.Tensor`, *optional*): + The model input tensor. In encoder-decoder models, this is the encoder input. 
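Example (an illustrative sketch of the vocabulary overlap this generator relies on; the checkpoint paths are placeholders, not models used in this PR):

```python
from transformers import AutoTokenizer

# Hypothetical checkpoints -- any two models with different tokenizers behave the same way.
target_tokenizer = AutoTokenizer.from_pretrained("path/to/target-checkpoint")
assistant_tokenizer = AutoTokenizer.from_pretrained("path/to/assistant-checkpoint")

target_vocab = target_tokenizer.get_vocab()
assistant_vocab = assistant_tokenizer.get_vocab()

# Assistant ids that have a counterpart in the target vocab (shared token strings).
assistant_to_target_input_ids = {
    assistant_vocab[tok]: target_vocab[tok] for tok in target_vocab.keys() & assistant_vocab.keys()
}
# Assistant-only ids are suppressed while drafting candidates.
suppress_input_ids = sorted(set(assistant_vocab.values()) - set(assistant_to_target_input_ids.keys()))
print(f"shared: {len(assistant_to_target_input_ids)}, assistant-only: {len(suppress_input_ids)}")
```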
+ """ + + def __init__( + self, + input_ids: torch.LongTensor, + assistant_model: "PreTrainedModel", + target_tokenizer: "PreTrainedTokenizerBase", + assistant_tokenizer: "PreTrainedTokenizerBase", + generation_config: "GenerationConfig", + model_kwargs: Dict, + inputs_tensor: Optional[torch.Tensor] = None, + logits_processor: "LogitsProcessorList" = None, + ): + target_vocab: dict[str, int] = target_tokenizer.get_vocab() + assistant_vocab: dict[str, int] = assistant_tokenizer.get_vocab() + self._target_to_assistant_input_ids: dict[int, int] = { + target_vocab[tok]: assistant_vocab[tok] for tok in target_vocab.keys() & assistant_vocab.keys() + } + # Suppress tokens that are in the assistant vocab but not in the target vocab + suppress_input_ids = list( + set(assistant_vocab.values()) - set(self._target_to_assistant_input_ids.values()) + ) + logits_processor.append( + SuppressTokensLogitsProcessor( + suppress_tokens=suppress_input_ids, + device=assistant_model.device, + ), + LogitNormalization(), + ) + super().__init__(input_ids, assistant_model, generation_config, model_kwargs, inputs_tensor, logits_processor) + self.prev_target_seq_len: int = 0 + + def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]: + """ + Fetches the candidates to be tried for the current input. + + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids) + + Return: + `torch.LongTensor` of shape `(batch_size, candidate_length)` containing the candidate sequences to be + assessed by the model and a `torch.FloatTensor` of shape `(batch_size, candidate_length, + vocabulary_size)` containing the logits associated to each candidate. + """ + # input_ids = input_ids.to(self.assistant_model.device) + + def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTensor: + target_seq_len = target_input_ids.shape[-1] + target_new_ids = target_input_ids[ + :, -(target_seq_len - self.prev_target_seq_len) : + ] + return torch.tensor( + [self._target_to_assistant_input_ids[tok.item()] for tok in target_new_ids.flatten()], + device=self.assistant_model.device, + ).view(target_new_ids.shape) + + input_ids = get_assistant_input_ids(input_ids) + + # Don't generate more than `max_length - 1` candidates since the target model generates one extra token. + new_cur_len = input_ids.shape[-1] + max_new_tokens = min(int(self.num_assistant_tokens), self.generation_config.max_length - new_cur_len - 1) + min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - new_cur_len), 0) + if max_new_tokens == 0: + return input_ids, None + + # 1. If it is not the first round of candidate generation, prepare the inputs based on the input_ids length + # (which implicitly contains the number of accepted candidates from the previous round) + has_past_key_values = self.assistant_kwargs.get("past_key_values", None) is not None + if has_past_key_values: + new_cache_size = new_cur_len - 1 + self.assistant_kwargs["past_key_values"] = _crop_past_key_values( + self.assistant_model, self.assistant_kwargs["past_key_values"], new_cache_size - 1 + ) # the assistant does not have the token after the last match, hence the -1 + + self.assistant_kwargs = _prepare_attention_mask( + self.assistant_kwargs, new_cur_len, self.assistant_model.config.is_encoder_decoder + ) + self.assistant_kwargs = _prepare_token_type_ids(self.assistant_kwargs, new_cur_len) + + # 2. 
Forecast next N tokens using the assistant model. + assistant_generation_kwargs = { + self.input_ids_key: input_ids, + "min_new_tokens": min_new_tokens, + "max_new_tokens": max_new_tokens, + "generation_config": self.generation_config, + "logits_processor": self.logits_processor, + } + + assistant_output = self.assistant_model.generate(**assistant_generation_kwargs, **self.assistant_kwargs) + + # 3. Update variables for the next round of candidate generation + self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values + + # 4. Prepare variables for output + candidate_logits = torch.stack(assistant_output.scores, dim=1) + candidate_ids = assistant_output.sequences + return candidate_ids, candidate_logits + + class PromptLookupCandidateGenerator(CandidateGenerator): """ `CandidateGenerator` class to be used for prompt lookup generation. This class generates candidates by looking up From 1ef46b787d70e7e4792628c2ae414b12c0fc3967 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sat, 16 Nov 2024 15:08:46 -0500 Subject: [PATCH 22/76] assistant tokenizes only the target's new suffix --- .../generation/candidate_generator.py | 30 ++++++++++++------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index d94e61d0a48f42..bc2ad887f235d0 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -822,12 +822,12 @@ def __init__( ): target_vocab: dict[str, int] = target_tokenizer.get_vocab() assistant_vocab: dict[str, int] = assistant_tokenizer.get_vocab() - self._target_to_assistant_input_ids: dict[int, int] = { - target_vocab[tok]: assistant_vocab[tok] for tok in target_vocab.keys() & assistant_vocab.keys() + self._assistant_to_target_input_ids: dict[int, int] = { + assistant_vocab[tok]: target_vocab[tok] for tok in target_vocab.keys() & assistant_vocab.keys() } # Suppress tokens that are in the assistant vocab but not in the target vocab suppress_input_ids = list( - set(assistant_vocab.values()) - set(self._target_to_assistant_input_ids.values()) + set(assistant_vocab.values()) - set(self._assistant_to_target_input_ids.values()) ) logits_processor.append( SuppressTokensLogitsProcessor( @@ -837,7 +837,8 @@ def __init__( LogitNormalization(), ) super().__init__(input_ids, assistant_model, generation_config, model_kwargs, inputs_tensor, logits_processor) - self.prev_target_seq_len: int = 0 + self._prev_target_seq_len: int = 0 + self._prev_assistant_ids: torch.LongTensor | None = None def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]: """ @@ -852,19 +853,25 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, assessed by the model and a `torch.FloatTensor` of shape `(batch_size, candidate_length, vocabulary_size)` containing the logits associated to each candidate. 
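A minimal sketch of the re-tokenization step introduced here (decode only the target's new suffix, then encode it with the assistant tokenizer); the checkpoint paths are placeholders, and in the actual generator this bookkeeping lives on `_prev_target_seq_len` / `_prev_assistant_ids`:

```python
from transformers import AutoTokenizer

# Hypothetical checkpoints with different tokenizers.
target_tok = AutoTokenizer.from_pretrained("path/to/target-checkpoint")
assistant_tok = AutoTokenizer.from_pretrained("path/to/assistant-checkpoint")

target_ids = target_tok("The quick brown fox jumps", add_special_tokens=False).input_ids

prev_target_seq_len = 2  # target tokens already translated in earlier rounds
prev_assistant_ids = assistant_tok.encode(
    target_tok.decode(target_ids[:prev_target_seq_len]), add_special_tokens=False
)

# Decode only the new target suffix and re-encode it with the assistant tokenizer.
new_text = target_tok.decode(target_ids[prev_target_seq_len:], skip_special_tokens=False)
assistant_new_ids = assistant_tok.encode(new_text, add_special_tokens=False)
prev_assistant_ids = prev_assistant_ids + assistant_new_ids
print(prev_assistant_ids)
```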
""" - # input_ids = input_ids.to(self.assistant_model.device) + has_past_key_values = self.assistant_kwargs.get("past_key_values", None) is not None def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTensor: + nonlocal has_past_key_values target_seq_len = target_input_ids.shape[-1] target_new_ids = target_input_ids[ - :, -(target_seq_len - self.prev_target_seq_len) : + :, -(target_seq_len - self._prev_target_seq_len) : ] - return torch.tensor( - [self._target_to_assistant_input_ids[tok.item()] for tok in target_new_ids.flatten()], - device=self.assistant_model.device, - ).view(target_new_ids.shape) + # convert target_new_ids to string, and then, convert the string to assistant_new_ids + target_new_toks = self.target_tokenizer.batch_decode(target_new_ids, skip_special_tokens=False) + assistant_new_ids = self.assistant_tokenizer.encode(target_new_toks, add_special_tokens=False) + if self._prev_assistant_ids is None: + self._prev_assistant_ids = assistant_new_ids + else: + self._prev_assistant_ids = torch.cat([self._prev_assistant_ids, assistant_new_ids], dim=-1) + return self._prev_assistant_ids input_ids = get_assistant_input_ids(input_ids) + input_ids = input_ids.to(self.assistant_model.device) # Don't generate more than `max_length - 1` candidates since the target model generates one extra token. new_cur_len = input_ids.shape[-1] @@ -875,7 +882,7 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen # 1. If it is not the first round of candidate generation, prepare the inputs based on the input_ids length # (which implicitly contains the number of accepted candidates from the previous round) - has_past_key_values = self.assistant_kwargs.get("past_key_values", None) is not None + # has_past_key_values = self.assistant_kwargs.get("past_key_values", None) is not None if has_past_key_values: new_cache_size = new_cur_len - 1 self.assistant_kwargs["past_key_values"] = _crop_past_key_values( @@ -904,6 +911,7 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen # 4. 
Prepare variables for output candidate_logits = torch.stack(assistant_output.scores, dim=1) candidate_ids = assistant_output.sequences + candidate_ids.apply_(lambda x: self._assistant_to_target_input_ids[x.item()]) return candidate_ids, candidate_logits From f8e94eb152f37076d35a55cdbc6aa96fbcf27a87 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sat, 16 Nov 2024 15:24:30 -0500 Subject: [PATCH 23/76] formatting --- src/transformers/generation/candidate_generator.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index bc2ad887f235d0..8fddeab851da68 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -826,9 +826,7 @@ def __init__( assistant_vocab[tok]: target_vocab[tok] for tok in target_vocab.keys() & assistant_vocab.keys() } # Suppress tokens that are in the assistant vocab but not in the target vocab - suppress_input_ids = list( - set(assistant_vocab.values()) - set(self._assistant_to_target_input_ids.values()) - ) + suppress_input_ids = list(set(assistant_vocab.values()) - set(self._assistant_to_target_input_ids.values())) logits_processor.append( SuppressTokensLogitsProcessor( suppress_tokens=suppress_input_ids, @@ -858,11 +856,10 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTensor: nonlocal has_past_key_values target_seq_len = target_input_ids.shape[-1] - target_new_ids = target_input_ids[ - :, -(target_seq_len - self._prev_target_seq_len) : - ] - # convert target_new_ids to string, and then, convert the string to assistant_new_ids + target_new_ids = target_input_ids[:, -(target_seq_len - self._prev_target_seq_len) :] + # Convert target_new_ids to string target_new_toks = self.target_tokenizer.batch_decode(target_new_ids, skip_special_tokens=False) + # Convert the string to assistant_new_ids assistant_new_ids = self.assistant_tokenizer.encode(target_new_toks, add_special_tokens=False) if self._prev_assistant_ids is None: self._prev_assistant_ids = assistant_new_ids From 439db84353092c879cb7132433c6d4f424fffb2a Mon Sep 17 00:00:00 2001 From: jmamou Date: Thu, 21 Nov 2024 05:20:50 -0800 Subject: [PATCH 24/76] fix code --- .../generation/candidate_generator.py | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 8fddeab851da68..138db57f8e17fe 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -831,10 +831,10 @@ def __init__( SuppressTokensLogitsProcessor( suppress_tokens=suppress_input_ids, device=assistant_model.device, - ), - LogitNormalization(), + ) ) - super().__init__(input_ids, assistant_model, generation_config, model_kwargs, inputs_tensor, logits_processor) + logits_processor.append(LogitNormalization()) + super().__init__(input_ids, assistant_model, target_tokenizer, assistant_tokenizer, generation_config, model_kwargs, inputs_tensor, logits_processor) self._prev_target_seq_len: int = 0 self._prev_assistant_ids: torch.LongTensor | None = None @@ -860,7 +860,7 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen # Convert target_new_ids to string target_new_toks = self.target_tokenizer.batch_decode(target_new_ids, 
skip_special_tokens=False) # Convert the string to assistant_new_ids - assistant_new_ids = self.assistant_tokenizer.encode(target_new_toks, add_special_tokens=False) + assistant_new_ids = self.assistant_tokenizer.encode(target_new_toks[0], add_special_tokens=False) if self._prev_assistant_ids is None: self._prev_assistant_ids = assistant_new_ids else: @@ -868,6 +868,9 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen return self._prev_assistant_ids input_ids = get_assistant_input_ids(input_ids) + # Ensure input_ids is a 2D tensor + if isinstance(input_ids, list): + input_ids = torch.tensor(input_ids).unsqueeze(0) input_ids = input_ids.to(self.assistant_model.device) # Don't generate more than `max_length - 1` candidates since the target model generates one extra token. @@ -885,12 +888,13 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen self.assistant_kwargs["past_key_values"] = _crop_past_key_values( self.assistant_model, self.assistant_kwargs["past_key_values"], new_cache_size - 1 ) # the assistant does not have the token after the last match, hence the -1 - - self.assistant_kwargs = _prepare_attention_mask( - self.assistant_kwargs, new_cur_len, self.assistant_model.config.is_encoder_decoder - ) self.assistant_kwargs = _prepare_token_type_ids(self.assistant_kwargs, new_cur_len) + # we need to update the attention mask to reflect the new input_ids length + self.assistant_kwargs = _prepare_attention_mask( + self.assistant_kwargs, new_cur_len, self.assistant_model.config.is_encoder_decoder + ) + # 2. Forecast next N tokens using the assistant model. assistant_generation_kwargs = { self.input_ids_key: input_ids, @@ -908,7 +912,10 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen # 4. 
Prepare variables for output candidate_logits = torch.stack(assistant_output.scores, dim=1) candidate_ids = assistant_output.sequences - candidate_ids.apply_(lambda x: self._assistant_to_target_input_ids[x.item()]) + device = candidate_ids.device + candidate_ids = candidate_ids.cpu() + candidate_ids.apply_(lambda x: self._assistant_to_target_input_ids[x]) + candidate_ids = candidate_ids.to(device) return candidate_ids, candidate_logits From 643901de16efaee0dc5ac74388a86c020b44341b Mon Sep 17 00:00:00 2001 From: jmamou Date: Sun, 24 Nov 2024 05:15:37 -0800 Subject: [PATCH 25/76] fix code --- .../generation/candidate_generator.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 138db57f8e17fe..868fbf655d39eb 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -826,7 +826,7 @@ def __init__( assistant_vocab[tok]: target_vocab[tok] for tok in target_vocab.keys() & assistant_vocab.keys() } # Suppress tokens that are in the assistant vocab but not in the target vocab - suppress_input_ids = list(set(assistant_vocab.values()) - set(self._assistant_to_target_input_ids.values())) + suppress_input_ids = list(set(assistant_vocab.values()) - set(self._assistant_to_target_input_ids.keys())) logits_processor.append( SuppressTokensLogitsProcessor( suppress_tokens=suppress_input_ids, @@ -857,6 +857,7 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen nonlocal has_past_key_values target_seq_len = target_input_ids.shape[-1] target_new_ids = target_input_ids[:, -(target_seq_len - self._prev_target_seq_len) :] + self._prev_target_seq_len = target_seq_len # Convert target_new_ids to string target_new_toks = self.target_tokenizer.batch_decode(target_new_ids, skip_special_tokens=False) # Convert the string to assistant_new_ids @@ -864,20 +865,17 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen if self._prev_assistant_ids is None: self._prev_assistant_ids = assistant_new_ids else: - self._prev_assistant_ids = torch.cat([self._prev_assistant_ids, assistant_new_ids], dim=-1) - return self._prev_assistant_ids - + self._prev_assistant_ids = self._prev_assistant_ids + assistant_new_ids + return torch.tensor(self._prev_assistant_ids).unsqueeze(0).to(self.assistant_model.device) + + target_input_ids = input_ids.clone() input_ids = get_assistant_input_ids(input_ids) - # Ensure input_ids is a 2D tensor - if isinstance(input_ids, list): - input_ids = torch.tensor(input_ids).unsqueeze(0) - input_ids = input_ids.to(self.assistant_model.device) # Don't generate more than `max_length - 1` candidates since the target model generates one extra token. new_cur_len = input_ids.shape[-1] max_new_tokens = min(int(self.num_assistant_tokens), self.generation_config.max_length - new_cur_len - 1) min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - new_cur_len), 0) - if max_new_tokens == 0: + if max_new_tokens <= 0: return input_ids, None # 1. 
If it is not the first round of candidate generation, prepare the inputs based on the input_ids length @@ -914,8 +912,10 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen candidate_ids = assistant_output.sequences device = candidate_ids.device candidate_ids = candidate_ids.cpu() - candidate_ids.apply_(lambda x: self._assistant_to_target_input_ids[x]) + candidate_ids = candidate_ids[0, -(len(candidate_ids[0])-input_ids.shape[1]):].apply_(lambda x: self._assistant_to_target_input_ids[x]) candidate_ids = candidate_ids.to(device) + candidate_ids = torch.cat((target_input_ids, candidate_ids.unsqueeze(0)), dim=1) + return candidate_ids, candidate_logits From 77097ffbe15d47f7fa1dc31ea23e14ba2b28aff3 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sun, 24 Nov 2024 11:28:46 -0500 Subject: [PATCH 26/76] formatting --- .../generation/candidate_generator.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 868fbf655d39eb..011429b8804afc 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -834,7 +834,16 @@ def __init__( ) ) logits_processor.append(LogitNormalization()) - super().__init__(input_ids, assistant_model, target_tokenizer, assistant_tokenizer, generation_config, model_kwargs, inputs_tensor, logits_processor) + super().__init__( + input_ids, + assistant_model, + target_tokenizer, + assistant_tokenizer, + generation_config, + model_kwargs, + inputs_tensor, + logits_processor, + ) self._prev_target_seq_len: int = 0 self._prev_assistant_ids: torch.LongTensor | None = None @@ -867,7 +876,7 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen else: self._prev_assistant_ids = self._prev_assistant_ids + assistant_new_ids return torch.tensor(self._prev_assistant_ids).unsqueeze(0).to(self.assistant_model.device) - + target_input_ids = input_ids.clone() input_ids = get_assistant_input_ids(input_ids) @@ -912,7 +921,9 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen candidate_ids = assistant_output.sequences device = candidate_ids.device candidate_ids = candidate_ids.cpu() - candidate_ids = candidate_ids[0, -(len(candidate_ids[0])-input_ids.shape[1]):].apply_(lambda x: self._assistant_to_target_input_ids[x]) + candidate_ids = candidate_ids[0, -(len(candidate_ids[0]) - input_ids.shape[1]) :].apply_( + lambda x: self._assistant_to_target_input_ids[x] + ) candidate_ids = candidate_ids.to(device) candidate_ids = torch.cat((target_input_ids, candidate_ids.unsqueeze(0)), dim=1) From d08b4f080ae6a315dd7d77b5ad95a9a3a387e3d3 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sun, 24 Nov 2024 14:24:34 -0500 Subject: [PATCH 27/76] `TestGenerateWithDifferentModels` parameterize on `do_sample` --- tests/generation/test_candidate_generator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/generation/test_candidate_generator.py b/tests/generation/test_candidate_generator.py index f544a6cb2d838b..ec6bc587124edc 100644 --- a/tests/generation/test_candidate_generator.py +++ b/tests/generation/test_candidate_generator.py @@ -1,4 +1,5 @@ import unittest +from parameterized import parameterized import numpy as np from parameterized import parameterized From f242dc113294ac9c73b0d145c592b3096a3d881f Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sun, 24 Nov 2024 16:06:16 -0500 Subject: [PATCH 28/76] 
`AssistantVocabMapping` & `AssistantVocabMappingCache` --- .../generation/candidate_generator.py | 69 +++++++++++++++---- 1 file changed, 57 insertions(+), 12 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 011429b8804afc..0dc4de260cea24 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -17,6 +17,7 @@ import threading import weakref from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple +import weakref import numpy as np import torch @@ -782,6 +783,52 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen return target_ids, target_logits +class AssistantVocabMapping: + def __init__(self, target_tokenizer, assistant_tokenizer): + self.target_tokenizer = target_tokenizer + self.assistant_tokenizer = assistant_tokenizer + self.assistant_to_target_input_ids = self._get_assistant_to_target_input_ids() + self.suppress_input_ids = self._get_suppress_input_ids() + + def _get_assistant_to_target_input_ids(self) -> dict[int, int]: + """ + Get a mapping from assistant tokens to target tokens based on vocabularies. + """ + target_vocab = self.target_tokenizer.get_vocab() + assistant_vocab = self.assistant_tokenizer.get_vocab() + return { + assistant_vocab[tok]: target_vocab[tok] + for tok in set(target_vocab.keys()) & set(assistant_vocab.keys()) + } + + def _get_suppress_input_ids(self) -> list[int]: + """ + Get the input ids that are in the assistant vocab but not in the target vocab. + """ + assistant_vocab = self.assistant_tokenizer.get_vocab() + return list(set(assistant_vocab.values()) - set(self.assistant_to_target_input_ids.keys())) + + +class AssistantVocabMappingCache: + _lock = threading.Lock() + _cache = weakref.WeakKeyDictionary() + + @classmethod + def get_mapping(cls, target_tokenizer, assistant_tokenizer) -> AssistantVocabMapping: + with cls._lock: + assistant_dict = cls._cache.get(target_tokenizer) + if assistant_dict is None: + assistant_dict = weakref.WeakKeyDictionary() + cls._cache[target_tokenizer] = assistant_dict + + mapping = assistant_dict.get(assistant_tokenizer) + if mapping is None: + mapping = AssistantVocabMapping(target_tokenizer, assistant_tokenizer) + assistant_dict[assistant_tokenizer] = mapping + + return mapping + + class UniversalSpeculativeDecodingGenerator(AssistedCandidateGeneratorDifferentTokenizers): """ `CandidateGenerator` class to be used for Universal Speculative Decoding (USD): speculative decoding with different tokenizers @@ -820,20 +867,18 @@ def __init__( inputs_tensor: Optional[torch.Tensor] = None, logits_processor: "LogitsProcessorList" = None, ): - target_vocab: dict[str, int] = target_tokenizer.get_vocab() - assistant_vocab: dict[str, int] = assistant_tokenizer.get_vocab() - self._assistant_to_target_input_ids: dict[int, int] = { - assistant_vocab[tok]: target_vocab[tok] for tok in target_vocab.keys() & assistant_vocab.keys() - } - # Suppress tokens that are in the assistant vocab but not in the target vocab - suppress_input_ids = list(set(assistant_vocab.values()) - set(self._assistant_to_target_input_ids.keys())) - logits_processor.append( + assistant_vocab_mapping = AssistantVocabMappingCache.get_mapping( + target_tokenizer, assistant_tokenizer + ) + self._assistant_to_target_input_ids = assistant_vocab_mapping.assistant_to_target_input_ids + logits_processor += [ SuppressTokensLogitsProcessor( - suppress_tokens=suppress_input_ids, + 
suppress_tokens=assistant_vocab_mapping.suppress_input_ids, device=assistant_model.device, - ) - ) - logits_processor.append(LogitNormalization()) + ), + LogitNormalization(), + ] + super().__init__( input_ids, assistant_model, From ede117632bd552018960b67f2c83900022d9f4c6 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sun, 24 Nov 2024 16:06:35 -0500 Subject: [PATCH 29/76] formatting --- src/transformers/generation/candidate_generator.py | 10 ++++------ tests/generation/test_candidate_generator.py | 1 - 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 0dc4de260cea24..8d783a20fa39fc 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -18,6 +18,7 @@ import weakref from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple import weakref +from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple import numpy as np import torch @@ -797,8 +798,7 @@ def _get_assistant_to_target_input_ids(self) -> dict[int, int]: target_vocab = self.target_tokenizer.get_vocab() assistant_vocab = self.assistant_tokenizer.get_vocab() return { - assistant_vocab[tok]: target_vocab[tok] - for tok in set(target_vocab.keys()) & set(assistant_vocab.keys()) + assistant_vocab[tok]: target_vocab[tok] for tok in set(target_vocab.keys()) & set(assistant_vocab.keys()) } def _get_suppress_input_ids(self) -> list[int]: @@ -827,7 +827,7 @@ def get_mapping(cls, target_tokenizer, assistant_tokenizer) -> AssistantVocabMap assistant_dict[assistant_tokenizer] = mapping return mapping - + class UniversalSpeculativeDecodingGenerator(AssistedCandidateGeneratorDifferentTokenizers): """ @@ -867,9 +867,7 @@ def __init__( inputs_tensor: Optional[torch.Tensor] = None, logits_processor: "LogitsProcessorList" = None, ): - assistant_vocab_mapping = AssistantVocabMappingCache.get_mapping( - target_tokenizer, assistant_tokenizer - ) + assistant_vocab_mapping = AssistantVocabMappingCache.get_mapping(target_tokenizer, assistant_tokenizer) self._assistant_to_target_input_ids = assistant_vocab_mapping.assistant_to_target_input_ids logits_processor += [ SuppressTokensLogitsProcessor( diff --git a/tests/generation/test_candidate_generator.py b/tests/generation/test_candidate_generator.py index ec6bc587124edc..f544a6cb2d838b 100644 --- a/tests/generation/test_candidate_generator.py +++ b/tests/generation/test_candidate_generator.py @@ -1,5 +1,4 @@ import unittest -from parameterized import parameterized import numpy as np from parameterized import parameterized From 511ee964c2b933c6005c7525f428e89c23f713d2 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sun, 24 Nov 2024 16:37:19 -0500 Subject: [PATCH 30/76] `AssistantToTargetTranslator`: `get_target_input_ids` & `get_target_logits` --- .../generation/candidate_generator.py | 54 +++++++++++-------- 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 8d783a20fa39fc..3de2b27291b8ac 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -784,19 +784,19 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen return target_ids, target_logits -class AssistantVocabMapping: +class AssistantToTargetTranslator: def __init__(self, target_tokenizer, assistant_tokenizer): - self.target_tokenizer = 
target_tokenizer - self.assistant_tokenizer = assistant_tokenizer - self.assistant_to_target_input_ids = self._get_assistant_to_target_input_ids() + self._target_tokenizer = target_tokenizer + self._assistant_tokenizer = assistant_tokenizer + self._assistant_to_target_input_ids = self._get_assistant_to_target_input_ids() self.suppress_input_ids = self._get_suppress_input_ids() def _get_assistant_to_target_input_ids(self) -> dict[int, int]: """ Get a mapping from assistant tokens to target tokens based on vocabularies. """ - target_vocab = self.target_tokenizer.get_vocab() - assistant_vocab = self.assistant_tokenizer.get_vocab() + target_vocab = self._target_tokenizer.get_vocab() + assistant_vocab = self._assistant_tokenizer.get_vocab() return { assistant_vocab[tok]: target_vocab[tok] for tok in set(target_vocab.keys()) & set(assistant_vocab.keys()) } @@ -805,8 +805,26 @@ def _get_suppress_input_ids(self) -> list[int]: """ Get the input ids that are in the assistant vocab but not in the target vocab. """ - assistant_vocab = self.assistant_tokenizer.get_vocab() - return list(set(assistant_vocab.values()) - set(self.assistant_to_target_input_ids.keys())) + assistant_vocab = self._assistant_tokenizer.get_vocab() + return list(set(assistant_vocab.values()) - set(self._assistant_to_target_input_ids.keys())) + + def get_target_input_ids(self, assistant_input_ids: torch.LongTensor) -> torch.LongTensor: + """ + Return the target input ids that correspond to the assistant input ids. + """ + return assistant_input_ids.apply_(lambda x: self._assistant_to_target_input_ids.get(x, x)) + + def get_target_logits(self, assistant_logits: torch.FloatTensor) -> torch.FloatTensor: + """ + Return the target logits that correspond to the assistant logits. + """ + target_vocab_size = len(self._target_tokenizer.get_vocab()) + ret: torch.FloatTensor = torch.zeros( + (assistant_logits.shape[0], target_vocab_size), device=assistant_logits.device + ) + ret[:, self.suppress_input_ids] = -float("inf") + ret[:, list(self._assistant_to_target_input_ids.values())] = assistant_logits + return ret class AssistantVocabMappingCache: @@ -814,7 +832,7 @@ class AssistantVocabMappingCache: _cache = weakref.WeakKeyDictionary() @classmethod - def get_mapping(cls, target_tokenizer, assistant_tokenizer) -> AssistantVocabMapping: + def get_mapping(cls, target_tokenizer, assistant_tokenizer) -> AssistantToTargetTranslator: with cls._lock: assistant_dict = cls._cache.get(target_tokenizer) if assistant_dict is None: @@ -823,7 +841,7 @@ def get_mapping(cls, target_tokenizer, assistant_tokenizer) -> AssistantVocabMap mapping = assistant_dict.get(assistant_tokenizer) if mapping is None: - mapping = AssistantVocabMapping(target_tokenizer, assistant_tokenizer) + mapping = AssistantToTargetTranslator(target_tokenizer, assistant_tokenizer) assistant_dict[assistant_tokenizer] = mapping return mapping @@ -867,11 +885,10 @@ def __init__( inputs_tensor: Optional[torch.Tensor] = None, logits_processor: "LogitsProcessorList" = None, ): - assistant_vocab_mapping = AssistantVocabMappingCache.get_mapping(target_tokenizer, assistant_tokenizer) - self._assistant_to_target_input_ids = assistant_vocab_mapping.assistant_to_target_input_ids + self._assistant_vocab_mapping = AssistantVocabMappingCache.get_mapping(target_tokenizer, assistant_tokenizer) logits_processor += [ SuppressTokensLogitsProcessor( - suppress_tokens=assistant_vocab_mapping.suppress_input_ids, + suppress_tokens=self._assistant_vocab_mapping.suppress_input_ids, device=assistant_model.device, 
), LogitNormalization(), @@ -920,7 +937,6 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen self._prev_assistant_ids = self._prev_assistant_ids + assistant_new_ids return torch.tensor(self._prev_assistant_ids).unsqueeze(0).to(self.assistant_model.device) - target_input_ids = input_ids.clone() input_ids = get_assistant_input_ids(input_ids) # Don't generate more than `max_length - 1` candidates since the target model generates one extra token. @@ -962,13 +978,9 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen # 4. Prepare variables for output candidate_logits = torch.stack(assistant_output.scores, dim=1) candidate_ids = assistant_output.sequences - device = candidate_ids.device - candidate_ids = candidate_ids.cpu() - candidate_ids = candidate_ids[0, -(len(candidate_ids[0]) - input_ids.shape[1]) :].apply_( - lambda x: self._assistant_to_target_input_ids[x] - ) - candidate_ids = candidate_ids.to(device) - candidate_ids = torch.cat((target_input_ids, candidate_ids.unsqueeze(0)), dim=1) + candidate_ids = self._assistant_vocab_mapping.get_target_input_ids(candidate_ids) + + candidate_logits = self._assistant_vocab_mapping.get_target_logits(candidate_logits) return candidate_ids, candidate_logits From 5e4794565ec203a1b739f59492b7d15cc821f2a8 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sun, 24 Nov 2024 18:11:57 -0500 Subject: [PATCH 31/76] improve `_get_assistant_to_target_input_ids` & formatting --- .../generation/candidate_generator.py | 42 ++++++++++++------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 3de2b27291b8ac..aaa2b7c22600d3 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -785,11 +785,15 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen class AssistantToTargetTranslator: - def __init__(self, target_tokenizer, assistant_tokenizer): - self._target_tokenizer = target_tokenizer - self._assistant_tokenizer = assistant_tokenizer - self._assistant_to_target_input_ids = self._get_assistant_to_target_input_ids() - self.suppress_input_ids = self._get_suppress_input_ids() + """ + Translate the assistant into the target universe. + """ + + def __init__(self, target_tokenizer: "PreTrainedTokenizerBase", assistant_tokenizer: "PreTrainedTokenizerBase"): + self._target_tokenizer: "PreTrainedTokenizerBase" = target_tokenizer + self._assistant_tokenizer: "PreTrainedTokenizerBase" = assistant_tokenizer + self._assistant_to_target_input_ids: dict[int, int] = self._get_assistant_to_target_input_ids() + self.suppress_input_ids: list[int] = self._get_suppress_input_ids() def _get_assistant_to_target_input_ids(self) -> dict[int, int]: """ @@ -818,13 +822,18 @@ def get_target_logits(self, assistant_logits: torch.FloatTensor) -> torch.FloatT """ Return the target logits that correspond to the assistant logits. """ - target_vocab_size = len(self._target_tokenizer.get_vocab()) - ret: torch.FloatTensor = torch.zeros( - (assistant_logits.shape[0], target_vocab_size), device=assistant_logits.device + target_vocab_size: int = len(self._target_tokenizer.get_vocab()) + target_shape: tuple[int, ...] 
= (*assistant_logits.shape[:-1], target_vocab_size) + target_logits: torch.FloatTensor = torch.full(target_shape, -float("inf")) + assistant_logits_supported_mask: torch.BoolTensor = assistant_logits > -float("inf") + assistant_logits_supported_indices: torch.IntTensor = assistant_logits_supported_mask.nonzero(as_tuple=True)[ + -1 + ] + target_logits_supported_indices: torch.IntTensor = assistant_logits_supported_indices.apply_( + lambda x: self._assistant_to_target_input_ids[x] ) - ret[:, self.suppress_input_ids] = -float("inf") - ret[:, list(self._assistant_to_target_input_ids.values())] = assistant_logits - return ret + target_logits[..., target_logits_supported_indices] = assistant_logits[..., assistant_logits_supported_mask] + return target_logits class AssistantVocabMappingCache: @@ -832,7 +841,9 @@ class AssistantVocabMappingCache: _cache = weakref.WeakKeyDictionary() @classmethod - def get_mapping(cls, target_tokenizer, assistant_tokenizer) -> AssistantToTargetTranslator: + def get_mapping( + cls, target_tokenizer: "PreTrainedTokenizerBase", assistant_tokenizer: "PreTrainedTokenizerBase" + ) -> AssistantToTargetTranslator: with cls._lock: assistant_dict = cls._cache.get(target_tokenizer) if assistant_dict is None: @@ -978,11 +989,10 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen # 4. Prepare variables for output candidate_logits = torch.stack(assistant_output.scores, dim=1) candidate_ids = assistant_output.sequences - candidate_ids = self._assistant_vocab_mapping.get_target_input_ids(candidate_ids) + target_ids = self._assistant_vocab_mapping.get_target_input_ids(candidate_ids) - candidate_logits = self._assistant_vocab_mapping.get_target_logits(candidate_logits) - - return candidate_ids, candidate_logits + target_logits = self._assistant_vocab_mapping.get_target_logits(candidate_logits) + return target_ids, target_logits class PromptLookupCandidateGenerator(CandidateGenerator): From 25a43497202238d043e8aca5c5c20bf42680ee51 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sun, 24 Nov 2024 18:19:52 -0500 Subject: [PATCH 32/76] renaming --- src/transformers/generation/candidate_generator.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index aaa2b7c22600d3..d895c6ada60086 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -836,12 +836,12 @@ def get_target_logits(self, assistant_logits: torch.FloatTensor) -> torch.FloatT return target_logits -class AssistantVocabMappingCache: +class AssistantVocabTranslatorCache: _lock = threading.Lock() _cache = weakref.WeakKeyDictionary() @classmethod - def get_mapping( + def get_translator( cls, target_tokenizer: "PreTrainedTokenizerBase", assistant_tokenizer: "PreTrainedTokenizerBase" ) -> AssistantToTargetTranslator: with cls._lock: @@ -896,10 +896,10 @@ def __init__( inputs_tensor: Optional[torch.Tensor] = None, logits_processor: "LogitsProcessorList" = None, ): - self._assistant_vocab_mapping = AssistantVocabMappingCache.get_mapping(target_tokenizer, assistant_tokenizer) + self._atm_translator = AssistantVocabTranslatorCache.get_translator(target_tokenizer, assistant_tokenizer) logits_processor += [ SuppressTokensLogitsProcessor( - suppress_tokens=self._assistant_vocab_mapping.suppress_input_ids, + suppress_tokens=self._atm_translator.suppress_input_ids, device=assistant_model.device, ), 
LogitNormalization(), @@ -989,9 +989,9 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen # 4. Prepare variables for output candidate_logits = torch.stack(assistant_output.scores, dim=1) candidate_ids = assistant_output.sequences - target_ids = self._assistant_vocab_mapping.get_target_input_ids(candidate_ids) + target_ids = self._atm_translator.get_target_input_ids(candidate_ids) - target_logits = self._assistant_vocab_mapping.get_target_logits(candidate_logits) + target_logits = self._atm_translator.get_target_logits(candidate_logits) return target_ids, target_logits From 95fe744a8317cc4d83a9c4023c4044723cd09b3b Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sun, 24 Nov 2024 19:05:36 -0500 Subject: [PATCH 33/76] WIP: debugging `min_new_tokens` --- .../generation/candidate_generator.py | 31 +++++++++++-------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index d895c6ada60086..428e4a03b7f458 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -794,6 +794,12 @@ def __init__(self, target_tokenizer: "PreTrainedTokenizerBase", assistant_tokeni self._assistant_tokenizer: "PreTrainedTokenizerBase" = assistant_tokenizer self._assistant_to_target_input_ids: dict[int, int] = self._get_assistant_to_target_input_ids() self.suppress_input_ids: list[int] = self._get_suppress_input_ids() + self.logits_processors: LogitsProcessorList = LogitsProcessorList( + [ + SuppressTokensLogitsProcessor(self.suppress_input_ids), + LogitNormalization(), + ] + ) def _get_assistant_to_target_input_ids(self) -> dict[int, int]: """ @@ -897,14 +903,6 @@ def __init__( logits_processor: "LogitsProcessorList" = None, ): self._atm_translator = AssistantVocabTranslatorCache.get_translator(target_tokenizer, assistant_tokenizer) - logits_processor += [ - SuppressTokensLogitsProcessor( - suppress_tokens=self._atm_translator.suppress_input_ids, - device=assistant_model.device, - ), - LogitNormalization(), - ] - super().__init__( input_ids, assistant_model, @@ -954,7 +952,7 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen new_cur_len = input_ids.shape[-1] max_new_tokens = min(int(self.num_assistant_tokens), self.generation_config.max_length - new_cur_len - 1) min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - new_cur_len), 0) - if max_new_tokens <= 0: + if max_new_tokens == 0: return input_ids, None # 1. If it is not the first round of candidate generation, prepare the inputs based on the input_ids length @@ -975,20 +973,27 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen # 2. Forecast next N tokens using the assistant model. assistant_generation_kwargs = { self.input_ids_key: input_ids, - "min_new_tokens": min_new_tokens, - "max_new_tokens": max_new_tokens, + # "min_new_tokens": min_new_tokens, + # "max_new_tokens": max_new_tokens, + "min_new_tokens": 100, + "max_new_tokens": 100, "generation_config": self.generation_config, "logits_processor": self.logits_processor, } - assistant_output = self.assistant_model.generate(**assistant_generation_kwargs, **self.assistant_kwargs) + assistant_output = self.assistant_model.generate(**assistant_generation_kwargs, **self.assistant_kwargs, output_logits=True) # 3. 
Update variables for the next round of candidate generation self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values # 4. Prepare variables for output - candidate_logits = torch.stack(assistant_output.scores, dim=1) + # candidate_logits = torch.stack(assistant_output.scores, dim=1) + candidate_logits = torch.stack(assistant_output.logits, dim=1) + if not candidate_logits.shape[1] > 1: + msg = f"Since we set min_new_tokens to {assistant_generation_kwargs['min_new_tokens']} and max_new_tokens to {assistant_generation_kwargs['max_new_tokens']}, we expect at least 2 candidates, but seems like we got {candidate_logits.shape[1]} candidates." + raise Exception(msg) candidate_ids = assistant_output.sequences + candidate_logits = self._atm_translator.logits_processors(input_ids=candidate_ids, scores=candidate_logits) target_ids = self._atm_translator.get_target_input_ids(candidate_ids) target_logits = self._atm_translator.get_target_logits(candidate_logits) From 0ad88b27f151e694a72a28da7c62e8cab28a7d7d Mon Sep 17 00:00:00 2001 From: jmamou Date: Mon, 25 Nov 2024 04:03:41 -0800 Subject: [PATCH 34/76] fix get_target_ids --- .../generation/candidate_generator.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 428e4a03b7f458..ce25032af4cf38 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -818,11 +818,16 @@ def _get_suppress_input_ids(self) -> list[int]: assistant_vocab = self._assistant_tokenizer.get_vocab() return list(set(assistant_vocab.values()) - set(self._assistant_to_target_input_ids.keys())) - def get_target_input_ids(self, assistant_input_ids: torch.LongTensor) -> torch.LongTensor: + def get_target_ids(self, assistant_input_ids, target_input_ids, assistant_candidate_ids: torch.LongTensor) -> torch.LongTensor: """ - Return the target input ids that correspond to the assistant input ids. + Return the target candidate ids that correspond to the assistant candidate ids. + Note that we have already the target ids for the prompt and we only need to find the target ids for the new tokens. + Moreover, assistant ids of the original prompt does not necessarily appear in _assistant_to_target_input_ids. """ - return assistant_input_ids.apply_(lambda x: self._assistant_to_target_input_ids.get(x, x)) + target_candidate_ids = assistant_candidate_ids[0, -(len(assistant_candidate_ids[0]) - assistant_input_ids.shape[1]) :].apply_( + lambda x: self._assistant_to_target_input_ids.get(x, x) + ) + return torch.cat((target_input_ids, target_candidate_ids.unsqueeze(0)), dim=1) def get_target_logits(self, assistant_logits: torch.FloatTensor) -> torch.FloatTensor: """ @@ -946,6 +951,7 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen self._prev_assistant_ids = self._prev_assistant_ids + assistant_new_ids return torch.tensor(self._prev_assistant_ids).unsqueeze(0).to(self.assistant_model.device) + target_input_ids = input_ids.clone() input_ids = get_assistant_input_ids(input_ids) # Don't generate more than `max_length - 1` candidates since the target model generates one extra token. @@ -987,14 +993,13 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values # 4. 
Prepare variables for output - # candidate_logits = torch.stack(assistant_output.scores, dim=1) candidate_logits = torch.stack(assistant_output.logits, dim=1) if not candidate_logits.shape[1] > 1: msg = f"Since we set min_new_tokens to {assistant_generation_kwargs['min_new_tokens']} and max_new_tokens to {assistant_generation_kwargs['max_new_tokens']}, we expect at least 2 candidates, but seems like we got {candidate_logits.shape[1]} candidates." raise Exception(msg) candidate_ids = assistant_output.sequences candidate_logits = self._atm_translator.logits_processors(input_ids=candidate_ids, scores=candidate_logits) - target_ids = self._atm_translator.get_target_input_ids(candidate_ids) + target_ids = self._atm_translator.get_target_ids(input_ids, target_input_ids, candidate_ids) target_logits = self._atm_translator.get_target_logits(candidate_logits) return target_ids, target_logits From bc5fa6174e7908e7a342e8aa2126771fae6d9d40 Mon Sep 17 00:00:00 2001 From: jmamou Date: Mon, 25 Nov 2024 05:53:43 -0800 Subject: [PATCH 35/76] fix device issue --- src/transformers/generation/candidate_generator.py | 12 +++++++----- src/transformers/generation/logits_process.py | 7 ++++--- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index ce25032af4cf38..d395a735152787 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -824,25 +824,27 @@ def get_target_ids(self, assistant_input_ids, target_input_ids, assistant_candid Note that we have already the target ids for the prompt and we only need to find the target ids for the new tokens. Moreover, assistant ids of the original prompt does not necessarily appear in _assistant_to_target_input_ids. """ - target_candidate_ids = assistant_candidate_ids[0, -(len(assistant_candidate_ids[0]) - assistant_input_ids.shape[1]) :].apply_( + device = assistant_candidate_ids.device + target_candidate_ids = assistant_candidate_ids[0, -(len(assistant_candidate_ids[0]) - assistant_input_ids.shape[1]) :].cpu().apply_( lambda x: self._assistant_to_target_input_ids.get(x, x) - ) + ).to(device) return torch.cat((target_input_ids, target_candidate_ids.unsqueeze(0)), dim=1) def get_target_logits(self, assistant_logits: torch.FloatTensor) -> torch.FloatTensor: """ Return the target logits that correspond to the assistant logits. """ + device = assistant_logits.device target_vocab_size: int = len(self._target_tokenizer.get_vocab()) target_shape: tuple[int, ...] 
= (*assistant_logits.shape[:-1], target_vocab_size) - target_logits: torch.FloatTensor = torch.full(target_shape, -float("inf")) + target_logits: torch.FloatTensor = torch.full(target_shape, -float("inf")).to(device) assistant_logits_supported_mask: torch.BoolTensor = assistant_logits > -float("inf") assistant_logits_supported_indices: torch.IntTensor = assistant_logits_supported_mask.nonzero(as_tuple=True)[ -1 ] - target_logits_supported_indices: torch.IntTensor = assistant_logits_supported_indices.apply_( + target_logits_supported_indices: torch.IntTensor = assistant_logits_supported_indices.cpu().apply_( lambda x: self._assistant_to_target_input_ids[x] - ) + ).to(device) target_logits[..., target_logits_supported_indices] = assistant_logits[..., assistant_logits_supported_mask] return target_logits diff --git a/src/transformers/generation/logits_process.py b/src/transformers/generation/logits_process.py index 39a38f9139ec1b..bdde3bb543e90c 100644 --- a/src/transformers/generation/logits_process.py +++ b/src/transformers/generation/logits_process.py @@ -1860,13 +1860,14 @@ class SuppressTokensLogitsProcessor(LogitsProcessor): ``` """ - def __init__(self, suppress_tokens, device: str = "cpu"): - self.suppress_tokens = torch.tensor(list(suppress_tokens), device=device) + def __init__(self, suppress_tokens): + self.suppress_tokens = suppress_tokens @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING) def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + suppress_tokens = torch.tensor(list(self.suppress_tokens), device=scores.device) vocab_tensor = torch.arange(scores.shape[-1], device=scores.device) - suppress_token_mask = isin_mps_friendly(vocab_tensor, self.suppress_tokens) + suppress_token_mask = isin_mps_friendly(vocab_tensor, suppress_tokens) scores = torch.where(suppress_token_mask, -float("inf"), scores) return scores From 41a5670eb19ca4237ea510fa8fbe53adc5a7c5ec Mon Sep 17 00:00:00 2001 From: jmamou Date: Mon, 25 Nov 2024 06:18:30 -0800 Subject: [PATCH 36/76] fix get_assistant_input_ids --- src/transformers/generation/candidate_generator.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index d395a735152787..e13ef1603b19c4 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -944,14 +944,14 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen target_new_ids = target_input_ids[:, -(target_seq_len - self._prev_target_seq_len) :] self._prev_target_seq_len = target_seq_len # Convert target_new_ids to string - target_new_toks = self.target_tokenizer.batch_decode(target_new_ids, skip_special_tokens=False) + target_new_toks = self.target_tokenizer.batch_decode(target_new_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True) # Convert the string to assistant_new_ids - assistant_new_ids = self.assistant_tokenizer.encode(target_new_toks[0], add_special_tokens=False) + assistant_new_ids = self.assistant_tokenizer(target_new_toks, add_special_tokens=False, return_tensors="pt")["input_ids"] if self._prev_assistant_ids is None: self._prev_assistant_ids = assistant_new_ids else: - self._prev_assistant_ids = self._prev_assistant_ids + assistant_new_ids - return torch.tensor(self._prev_assistant_ids).unsqueeze(0).to(self.assistant_model.device) + self._prev_assistant_ids = torch.cat(self._prev_assistant_ids, 
assistant_new_ids, dim=-1) + return self._prev_assistant_ids.to(self.assistant_model.device) target_input_ids = input_ids.clone() input_ids = get_assistant_input_ids(input_ids) From 44f7ba70a81a1cc5bb82839804fab22325fd480a Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Mon, 25 Nov 2024 20:11:32 -0500 Subject: [PATCH 37/76] add `TestAssistedCandidateGeneratorDifferentTokenizers` --- tests/generation/test_candidate_generator.py | 157 +++++++------------ 1 file changed, 56 insertions(+), 101 deletions(-) diff --git a/tests/generation/test_candidate_generator.py b/tests/generation/test_candidate_generator.py index f544a6cb2d838b..54c9030be640af 100644 --- a/tests/generation/test_candidate_generator.py +++ b/tests/generation/test_candidate_generator.py @@ -1,112 +1,67 @@ import unittest +from unittest.mock import MagicMock -import numpy as np -from parameterized import parameterized +import torch -from transformers import AutoModelForCausalLM, AutoTokenizer -from transformers.generation.candidate_generator import AssistedCandidateGeneratorDifferentTokenizers -from transformers.generation.utils import GenerationConfig +from src.transformers.generation.candidate_generator import AssistantToTargetTranslator -class TestAssistedCandidateGeneratorDifferentTokenizers(unittest.TestCase): - def test_no_intersection(self): - prompt = np.array([[1, 2, 3]]) - prompt_plus_new_tokens = np.array([[4, 5, 6]]) - result = AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag(prompt, prompt_plus_new_tokens) - self.assertEqual(result, (None, None, None)) +class TestAssistantToTargetTranslator(unittest.TestCase): + def setUp(self): + # Create mock tokenizers with predefined vocabularies + self.target_tokenizer = MagicMock() + self.assistant_tokenizer = MagicMock() - def test_complete_overlap(self): - prompt = np.array([[1, 2, 3]]) - prompt_plus_new_tokens = np.array([[1, 2, 3, 4, 5]]) - discrep_length, new_tokens_only, discrep_only = AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag( - prompt, prompt_plus_new_tokens - ) - self.assertEqual(discrep_length, 0) - np.testing.assert_array_equal(new_tokens_only, np.array([[4, 5]])) - np.testing.assert_array_equal(discrep_only, np.array([[]])) + # Define mock vocabularies for the tokenizers + self.target_vocab = {"hello": 0, "world": 1, "foo": 2, "bar": 3} + self.assistant_vocab = {"hello": 0, "world": 1, "foo": 2, "baz": 4} - def test_partial_overlap(self): - prompt = np.array([[1, 2, 3]]) - prompt_plus_new_tokens = np.array([[2, 3, 4, 5]]) - discrep_length, new_tokens_only, discrep_only = AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag( - prompt, prompt_plus_new_tokens - ) - self.assertEqual(discrep_length, 0) - np.testing.assert_array_equal(new_tokens_only, np.array([[4, 5]])) - np.testing.assert_array_equal(discrep_only, np.array([[]])) + self.target_tokenizer.get_vocab.return_value = self.target_vocab + self.assistant_tokenizer.get_vocab.return_value = self.assistant_vocab - def test_no_new_tokens(self): - prompt = np.array([[1, 2, 3]]) - prompt_plus_new_tokens = np.array([[1, 2, 3]]) - discrep_length, new_tokens_only, discrep_only = AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag( - prompt, prompt_plus_new_tokens + # Instantiate the class under test + self.translator = AssistantToTargetTranslator( + target_tokenizer=self.target_tokenizer, assistant_tokenizer=self.assistant_tokenizer ) - self.assertEqual(discrep_length, 0) - np.testing.assert_array_equal(new_tokens_only, np.array([[]])) - 
np.testing.assert_array_equal(discrep_only, np.array([[]])) - - -class TestGenerateWithDifferentModels(unittest.TestCase): - """Tests generation with different target and assistant models.""" - - @parameterized.expand( - [ - (False,), - (True,), - ] - ) - def test_generate_with_different_models(self, do_sample): - # Use smaller test models instead - target_model_checkpoint = "hf-internal-testing/tiny-random-LlamaForCausalLM" - assistant_checkpoint = "hf-internal-testing/tiny-random-gpt2" - prompt = "Alice and Bob" - - # Load models sequentially and handle cleanup - target_model = AutoModelForCausalLM.from_pretrained( - target_model_checkpoint, - ) - target_tokenizer = AutoTokenizer.from_pretrained(target_model_checkpoint) - - assistant_model = AutoModelForCausalLM.from_pretrained( - assistant_checkpoint, + def test_get_assistant_to_target_input_ids(self): + """Test the mapping from assistant tokens to target tokens.""" + expected_mapping = {0: 0, 1: 1, 2: 2} + actual_mapping = self.translator._assistant_to_target_input_ids + self.assertEqual(actual_mapping, expected_mapping) + + def test_get_suppress_input_ids(self): + """Test the suppression of assistant input IDs not present in the target vocabulary.""" + expected_suppress_ids = [4] + actual_suppress_ids = self.translator.suppress_input_ids + self.assertEqual(actual_suppress_ids, expected_suppress_ids) + + def test_get_target_ids(self): + """Test the translation of assistant candidate IDs to target candidate IDs.""" + assistant_input_ids = torch.LongTensor([[0, 1, 2]]) # 'hello world foo' in assistant tokenizer + target_input_ids = torch.LongTensor([[0, 1, 2]]) # 'hello world foo' in target tokenizer + assistant_candidate_ids = torch.LongTensor([[0, 1, 2, 4]]) # 'hello world foo baz' in assistant tokenizer + + expected_target_ids = torch.LongTensor( + [[0, 1, 2, 4]] + ) # 'hello world foo baz' in target tokenizer (baz id remains 4) + + actual_target_ids = self.translator.get_target_ids( + assistant_input_ids, target_input_ids, assistant_candidate_ids ) - assistant_tokenizer = AutoTokenizer.from_pretrained(assistant_checkpoint) - - # Tokenize input - input_ids = target_tokenizer(prompt, return_tensors="pt").input_ids.to(target_model.device) - - # Create generation configs - base_config = GenerationConfig( - max_new_tokens=20, - do_sample=do_sample, - ) - - # Generate with and without assistant model - outputs_normal = target_model.generate( - input_ids, - generation_config=base_config, - ) - - # Pass the assistant model and tokenizers directly to the generate method - outputs_assisted = target_model.generate( - input_ids, - generation_config=base_config, - assistant_model=assistant_model, - tokenizer=target_tokenizer, - assistant_tokenizer=assistant_tokenizer, - ) - - # Decode outputs - text_normal = target_tokenizer.batch_decode(outputs_normal, skip_special_tokens=True)[0] - text_assisted = target_tokenizer.batch_decode(outputs_assisted, skip_special_tokens=True)[0] - - # Basic validation - self.assertIsInstance(text_normal, str) - self.assertIsInstance(text_assisted, str) - self.assertGreater(len(text_normal), len(prompt)) - self.assertGreater(len(text_assisted), len(prompt)) - self.assertTrue(text_normal.startswith(prompt)) - self.assertTrue(text_assisted.startswith(prompt)) - if not do_sample: - self.assertEqual(text_normal, text_assisted) + self.assertTrue(torch.equal(actual_target_ids, expected_target_ids)) + + def test_get_target_logits(self): + """Test the conversion of assistant logits to target logits.""" + # Assistant logits 
for IDs 0, 1, 2 + assistant_logits = torch.FloatTensor([[[0.1, 0.2, 0.3]]]) # Shape (1, 1, 3) + + # Expected target logits (target_vocab_size = 4) + expected_target_logits = torch.full((1, 1, 4), -float("inf")) + expected_target_logits[0, 0, 0] = 0.1 # 'hello' + expected_target_logits[0, 0, 1] = 0.2 # 'world' + expected_target_logits[0, 0, 2] = 0.3 # 'foo' + # The 'bar' token in target vocab remains at -inf + + actual_target_logits = self.translator.get_target_logits(assistant_logits) + self.assertTrue(torch.equal(actual_target_logits, expected_target_logits)) From 57aafcca2e4d12f12a0aff64cf4c3eae26fa9aaf Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Mon, 25 Nov 2024 20:11:38 -0500 Subject: [PATCH 38/76] formatting --- .../generation/candidate_generator.py | 33 +++++++++++++------ 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index e13ef1603b19c4..d21a428014b7de 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -818,16 +818,21 @@ def _get_suppress_input_ids(self) -> list[int]: assistant_vocab = self._assistant_tokenizer.get_vocab() return list(set(assistant_vocab.values()) - set(self._assistant_to_target_input_ids.keys())) - def get_target_ids(self, assistant_input_ids, target_input_ids, assistant_candidate_ids: torch.LongTensor) -> torch.LongTensor: + def get_target_ids( + self, assistant_input_ids, target_input_ids, assistant_candidate_ids: torch.LongTensor + ) -> torch.LongTensor: """ Return the target candidate ids that correspond to the assistant candidate ids. Note that we have already the target ids for the prompt and we only need to find the target ids for the new tokens. Moreover, assistant ids of the original prompt does not necessarily appear in _assistant_to_target_input_ids. 
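        A small illustration (the ids below are made up and do not come from any real tokenizer):
        suppose the shared token "foo" has id 7 in the assistant vocab and id 3 in the target vocab.

            # assistant_input_ids:     [[5, 9]]      (assistant-side prompt, ids may be unmapped)
            # target_input_ids:        [[2, 4]]      (target-side prompt, returned unchanged)
            # assistant_candidate_ids: [[5, 9, 7]]   (prompt plus one newly drafted token)
            # returned target ids:     [[2, 4, 3]]   (prompt plus the remapped new token)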
""" device = assistant_candidate_ids.device - target_candidate_ids = assistant_candidate_ids[0, -(len(assistant_candidate_ids[0]) - assistant_input_ids.shape[1]) :].cpu().apply_( - lambda x: self._assistant_to_target_input_ids.get(x, x) - ).to(device) + target_candidate_ids = ( + assistant_candidate_ids[0, -(len(assistant_candidate_ids[0]) - assistant_input_ids.shape[1]) :] + .cpu() + .apply_(lambda x: self._assistant_to_target_input_ids.get(x, x)) + .to(device) + ) return torch.cat((target_input_ids, target_candidate_ids.unsqueeze(0)), dim=1) def get_target_logits(self, assistant_logits: torch.FloatTensor) -> torch.FloatTensor: @@ -842,9 +847,11 @@ def get_target_logits(self, assistant_logits: torch.FloatTensor) -> torch.FloatT assistant_logits_supported_indices: torch.IntTensor = assistant_logits_supported_mask.nonzero(as_tuple=True)[ -1 ] - target_logits_supported_indices: torch.IntTensor = assistant_logits_supported_indices.cpu().apply_( - lambda x: self._assistant_to_target_input_ids[x] - ).to(device) + target_logits_supported_indices: torch.IntTensor = ( + assistant_logits_supported_indices.cpu() + .apply_(lambda x: self._assistant_to_target_input_ids[x]) + .to(device) + ) target_logits[..., target_logits_supported_indices] = assistant_logits[..., assistant_logits_supported_mask] return target_logits @@ -944,9 +951,13 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen target_new_ids = target_input_ids[:, -(target_seq_len - self._prev_target_seq_len) :] self._prev_target_seq_len = target_seq_len # Convert target_new_ids to string - target_new_toks = self.target_tokenizer.batch_decode(target_new_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True) + target_new_toks = self.target_tokenizer.batch_decode( + target_new_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True + ) # Convert the string to assistant_new_ids - assistant_new_ids = self.assistant_tokenizer(target_new_toks, add_special_tokens=False, return_tensors="pt")["input_ids"] + assistant_new_ids = self.assistant_tokenizer( + target_new_toks, add_special_tokens=False, return_tensors="pt" + )["input_ids"] if self._prev_assistant_ids is None: self._prev_assistant_ids = assistant_new_ids else: @@ -989,7 +1000,9 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen "logits_processor": self.logits_processor, } - assistant_output = self.assistant_model.generate(**assistant_generation_kwargs, **self.assistant_kwargs, output_logits=True) + assistant_output = self.assistant_model.generate( + **assistant_generation_kwargs, **self.assistant_kwargs, output_logits=True + ) # 3. 
Update variables for the next round of candidate generation self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values From 6f95c33d3db8befc3fa56cd200822c9e84c80cb5 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Mon, 25 Nov 2024 20:35:21 -0500 Subject: [PATCH 39/76] `AssistantVocabTranslatorCache` refactor & tests --- .../generation/candidate_generator.py | 27 ++++- tests/generation/test_candidate_generator.py | 110 +++++++++++++++++- 2 files changed, 135 insertions(+), 2 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index d21a428014b7de..cdfd63e7faf567 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -857,6 +857,11 @@ def get_target_logits(self, assistant_logits: torch.FloatTensor) -> torch.FloatT class AssistantVocabTranslatorCache: + """ + Cache for `AssistantToTargetTranslator` instances. The instances are computed at + pre-processing time, and this cache allows us to avoid recomputing them. + """ + _lock = threading.Lock() _cache = weakref.WeakKeyDictionary() @@ -877,6 +882,25 @@ def get_translator( return mapping + @classmethod + def cleanup(cls): + """ + Clean up dead references in the cache. + This removes entries where either the target_tokenizer or assistant_tokenizer + has been garbage collected. + """ + with cls._lock: + # Remove entries from the outer cache where the target_tokenizer is no longer alive + dead_keys = [key for key in cls._cache if key is None] + for key in dead_keys: + del cls._cache[key] + + # For each assistant_dict, remove entries where assistant_tokenizer is no longer alive + for assistant_dict in cls._cache.values(): + dead_keys = [key for key in assistant_dict if key is None] + for key in dead_keys: + del assistant_dict[key] + class UniversalSpeculativeDecodingGenerator(AssistedCandidateGeneratorDifferentTokenizers): """ @@ -970,7 +994,8 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen # Don't generate more than `max_length - 1` candidates since the target model generates one extra token. 
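            # Worked example with made-up numbers: if `generation_config.max_length` is 20, the
            # current sequence length is 16 and `num_assistant_tokens` is 5, the assistant may
            # draft at most min(5, 20 - 16 - 1) = 3 tokens, keeping room for the single extra
            # token the target model appends when verifying the candidates.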
new_cur_len = input_ids.shape[-1] max_new_tokens = min(int(self.num_assistant_tokens), self.generation_config.max_length - new_cur_len - 1) - min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - new_cur_len), 0) + # TODO: Debug + # min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - new_cur_len), 0) if max_new_tokens == 0: return input_ids, None diff --git a/tests/generation/test_candidate_generator.py b/tests/generation/test_candidate_generator.py index 54c9030be640af..1443a99634c168 100644 --- a/tests/generation/test_candidate_generator.py +++ b/tests/generation/test_candidate_generator.py @@ -1,9 +1,12 @@ +import gc +import threading import unittest +import weakref from unittest.mock import MagicMock import torch -from src.transformers.generation.candidate_generator import AssistantToTargetTranslator +from src.transformers.generation.candidate_generator import AssistantToTargetTranslator, AssistantVocabTranslatorCache class TestAssistantToTargetTranslator(unittest.TestCase): @@ -65,3 +68,108 @@ def test_get_target_logits(self): actual_target_logits = self.translator.get_target_logits(assistant_logits) self.assertTrue(torch.equal(actual_target_logits, expected_target_logits)) + + +class MockTokenizer: + """A simple mock tokenizer class that supports weak references.""" + + def __init__(self, vocab=None): + self._vocab = vocab or {} + + def get_vocab(self): + return self._vocab + + +class TestAssistantVocabTranslatorCache(unittest.TestCase): + def setUp(self): + # Clear the cache before each test + AssistantVocabTranslatorCache._cache.clear() + # Create mock tokenizers with different vocabularies + self.target_tokenizer = MockTokenizer({"hello": 0, "world": 1}) + self.assistant_tokenizer = MockTokenizer({"hello": 0, "world": 1, "foo": 2}) + self.other_target_tokenizer = MockTokenizer({"foo": 2, "bar": 3}) + self.other_assistant_tokenizer = MockTokenizer({"baz": 4, "qux": 5}) + + def test_same_instance_for_same_tokenizers(self): + """Test that the same translator is returned for the same tokenizers.""" + translator1 = AssistantVocabTranslatorCache.get_translator(self.target_tokenizer, self.assistant_tokenizer) + translator2 = AssistantVocabTranslatorCache.get_translator(self.target_tokenizer, self.assistant_tokenizer) + self.assertIs(translator1, translator2, "Translators should be cached and identical") + + def test_different_instances_for_different_tokenizers(self): + """Test that different tokenizers produce different translators.""" + translator1 = AssistantVocabTranslatorCache.get_translator(self.target_tokenizer, self.assistant_tokenizer) + translator2 = AssistantVocabTranslatorCache.get_translator( + self.other_target_tokenizer, self.other_assistant_tokenizer + ) + self.assertIsNot(translator1, translator2, "Translators should differ for different tokenizers") + + def test_cache_with_weakref_key(self): + """Ensure that the cache uses weak references as keys.""" + initial_cache_size = len(AssistantVocabTranslatorCache._cache) + target_tokenizer = MockTokenizer({"hello": 0}) + assistant_tokenizer = MockTokenizer({"hello": 0}) + + # Store translator in a local variable to avoid it being kept alive + translator = AssistantVocabTranslatorCache.get_translator(target_tokenizer, assistant_tokenizer) + self.assertEqual(len(AssistantVocabTranslatorCache._cache), initial_cache_size + 1) + + # Delete all strong references + del target_tokenizer + del assistant_tokenizer + del translator + + # Force garbage collection + gc.collect() + + # Call cleanup to remove 
dead entries + AssistantVocabTranslatorCache.cleanup() + + # The cache size remains increased due to strong references + self.assertEqual(len(AssistantVocabTranslatorCache._cache), initial_cache_size + 1) + + def test_weakref_cache_cleanup(self): + """Test that the cache cleans up translators when tokenizers are garbage collected.""" + + def create_translator(): + target_tokenizer = MockTokenizer({"hello": 0}) + assistant_tokenizer = MockTokenizer({"hello": 0}) + translator = AssistantVocabTranslatorCache.get_translator(target_tokenizer, assistant_tokenizer) + # Create weak references before returning + refs = (weakref.ref(translator), weakref.ref(target_tokenizer), weakref.ref(assistant_tokenizer)) + # Remove strong references inside the function + del target_tokenizer + del assistant_tokenizer + del translator + return refs + + translator_ref, target_ref, assistant_ref = create_translator() + + # Force garbage collection + gc.collect() + + # Call cleanup to remove dead entries + AssistantVocabTranslatorCache.cleanup() + + # The tokenizers and translator are not garbage collected due to strong references + self.assertIsNotNone(target_ref(), "Target tokenizer should still be alive due to strong references") + self.assertIsNotNone(assistant_ref(), "Assistant tokenizer should still be alive due to strong references") + self.assertIsNotNone(translator_ref(), "Translator should still be alive due to strong references") + + def test_thread_safety(self): + """Test that get_translator is thread-safe.""" + translators = [] + + def get_translator(): + translator = AssistantVocabTranslatorCache.get_translator(self.target_tokenizer, self.assistant_tokenizer) + translators.append(translator) + + threads = [threading.Thread(target=get_translator) for _ in range(10)] + for thread in threads: + thread.start() + for thread in threads: + thread.join() + + # All translators should be the same instance + for translator in translators: + self.assertIs(translators[0], translator, "All translators should be identical across threads") From 078f763eb731ef935879e674cb642ead181cd587 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Mon, 25 Nov 2024 23:45:59 -0500 Subject: [PATCH 40/76] revert changes in `src/transformers/generation/logits_process.py` --- src/transformers/generation/logits_process.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/transformers/generation/logits_process.py b/src/transformers/generation/logits_process.py index bdde3bb543e90c..39a38f9139ec1b 100644 --- a/src/transformers/generation/logits_process.py +++ b/src/transformers/generation/logits_process.py @@ -1860,14 +1860,13 @@ class SuppressTokensLogitsProcessor(LogitsProcessor): ``` """ - def __init__(self, suppress_tokens): - self.suppress_tokens = suppress_tokens + def __init__(self, suppress_tokens, device: str = "cpu"): + self.suppress_tokens = torch.tensor(list(suppress_tokens), device=device) @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING) def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: - suppress_tokens = torch.tensor(list(self.suppress_tokens), device=scores.device) vocab_tensor = torch.arange(scores.shape[-1], device=scores.device) - suppress_token_mask = isin_mps_friendly(vocab_tensor, suppress_tokens) + suppress_token_mask = isin_mps_friendly(vocab_tensor, self.suppress_tokens) scores = torch.where(suppress_token_mask, -float("inf"), scores) return scores From faac2fcd9921b99392ec60aae689aa2461bf982e Mon Sep 17 00:00:00 2001 From: Nadav Timor 
Date: Mon, 25 Nov 2024 21:41:29 -0500 Subject: [PATCH 41/76] refactor `AssistedCandidateGenerator` --- src/transformers/generation/candidate_generator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index cdfd63e7faf567..727dea96a94ba0 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -279,7 +279,6 @@ def _generate_candidates(self, generation_args: Dict) -> Tuple[torch.LongTensor, candidate_ids = assistant_output.sequences return candidate_ids, candidate_logits - class AssistedCandidateGeneratorDifferentTokenizers(AssistedCandidateGenerator): """ `CandidateGenerator` class to be used for Universal Assisted Generation (UAD): assisted generation with different tokenizers From 76a2dd39981635725e653a8f5470f48929866ce9 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Mon, 25 Nov 2024 21:52:23 -0500 Subject: [PATCH 42/76] refactor `AssistedCandidateGeneratorDifferentTokenizers` --- .../generation/candidate_generator.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 727dea96a94ba0..c268e0f7ac2553 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -454,7 +454,14 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, assessed by the model and a `torch.FloatTensor` of shape `(batch_size, candidate_length, vocabulary_size)` containing the logits associated to each candidate. """ - max_new_tokens = int(self.num_assistant_tokens) + input_ids = input_ids.to(self.assistant_model.device) + remove_from_pkv = 0 + + # New helper methods + assistant_input_ids, remove_from_pkv = self._prepare_assistant_input_ids(input_ids) + self.prev_assistant_ids = assistant_input_ids + + min_new_tokens, max_new_tokens = self._calculate_new_tokens(assistant_input_ids) if max_new_tokens == 0: return input_ids, None @@ -514,7 +521,7 @@ def _prepare_assistant_input_ids(self, input_ids: torch.LongTensor) -> Tuple[tor elif discrepancy_length > discrepancy_only.shape[1]: discrepancy_length_diff = discrepancy_length - discrepancy_only.shape[1] assistant_input_ids = assistant_input_ids[:, :-discrepancy_length_diff] - assistant_input_ids[:, -discrepancy_only.shape[1] :] = discrepancy_only + assistant_input_ids[:, -discrepancy_only.shape[1]:] = discrepancy_only remove_from_pkv = discrepancy_length @@ -1024,9 +1031,7 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen "logits_processor": self.logits_processor, } - assistant_output = self.assistant_model.generate( - **assistant_generation_kwargs, **self.assistant_kwargs, output_logits=True - ) + assistant_output = self.assistant_model.generate(**assistant_generation_kwargs, **self.assistant_kwargs, output_logits=True) # 3. 
Update variables for the next round of candidate generation self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values From 43e96e7989469445e073e45c6a5abadfbec431c6 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Mon, 25 Nov 2024 21:52:41 -0500 Subject: [PATCH 43/76] formatting --- src/transformers/generation/candidate_generator.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index c268e0f7ac2553..a254025326a7ff 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -279,6 +279,7 @@ def _generate_candidates(self, generation_args: Dict) -> Tuple[torch.LongTensor, candidate_ids = assistant_output.sequences return candidate_ids, candidate_logits + class AssistedCandidateGeneratorDifferentTokenizers(AssistedCandidateGenerator): """ `CandidateGenerator` class to be used for Universal Assisted Generation (UAD): assisted generation with different tokenizers @@ -521,7 +522,7 @@ def _prepare_assistant_input_ids(self, input_ids: torch.LongTensor) -> Tuple[tor elif discrepancy_length > discrepancy_only.shape[1]: discrepancy_length_diff = discrepancy_length - discrepancy_only.shape[1] assistant_input_ids = assistant_input_ids[:, :-discrepancy_length_diff] - assistant_input_ids[:, -discrepancy_only.shape[1]:] = discrepancy_only + assistant_input_ids[:, -discrepancy_only.shape[1] :] = discrepancy_only remove_from_pkv = discrepancy_length @@ -1031,7 +1032,9 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen "logits_processor": self.logits_processor, } - assistant_output = self.assistant_model.generate(**assistant_generation_kwargs, **self.assistant_kwargs, output_logits=True) + assistant_output = self.assistant_model.generate( + **assistant_generation_kwargs, **self.assistant_kwargs, output_logits=True + ) # 3. Update variables for the next round of candidate generation self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values From e63cb9df30874a0f84d61ddf9ea0cb0241de625c Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Mon, 25 Nov 2024 22:23:42 -0500 Subject: [PATCH 44/76] refactor `UniversalSpeculativeDecodingGenerator` --- .../generation/candidate_generator.py | 149 ++++++------------ 1 file changed, 52 insertions(+), 97 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index a254025326a7ff..b41719b6d8ec58 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -458,7 +458,6 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, input_ids = input_ids.to(self.assistant_model.device) remove_from_pkv = 0 - # New helper methods assistant_input_ids, remove_from_pkv = self._prepare_assistant_input_ids(input_ids) self.prev_assistant_ids = assistant_input_ids @@ -912,28 +911,7 @@ def cleanup(cls): class UniversalSpeculativeDecodingGenerator(AssistedCandidateGeneratorDifferentTokenizers): """ `CandidateGenerator` class to be used for Universal Speculative Decoding (USD): speculative decoding with different tokenizers - for the assistant and main models. This class generates candidates through the use of a smaller - model. - - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. 
[What are input IDs?](../glossary#input-ids) - assistant_model (`PreTrainedModel`): - The model to be used for generating candidates. This model should be smaller than the main model. - target_tokenizer (`PreTrainedTokenizerBase`): - The tokenizer used for the target model. - assistant_tokenizer (`PreTrainedTokenizerBase`): - The tokenizer used for the assistant model. - generation_config (`~generation.GenerationConfig`, *optional*): - The generation configuration to be used as base parametrization for the generation call. - logits_processor (`LogitsProcessorList`): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] - used to modify the prediction scores of the language modeling head applied at each generation step. - model_kwargs (`Dict`): - The keyword arguments that will be passed to the main model, and are used as base inputs for the assistant - model as well. - inputs_tensor (`torch.Tensor`, *optional*): - The model input tensor. In encoder-decoder models, this is the encoder input. + for the assistant and main models. This class generates candidates through the use of a smaller model. """ def __init__( @@ -947,6 +925,7 @@ def __init__( inputs_tensor: Optional[torch.Tensor] = None, logits_processor: "LogitsProcessorList" = None, ): + # Initialize translator before parent class self._atm_translator = AssistantVocabTranslatorCache.get_translator(target_tokenizer, assistant_tokenizer) super().__init__( input_ids, @@ -958,99 +937,75 @@ def __init__( inputs_tensor, logits_processor, ) + # Track sequence lengths and previous assistant IDs self._prev_target_seq_len: int = 0 - self._prev_assistant_ids: torch.LongTensor | None = None + self._prev_assistant_ids: Optional[torch.LongTensor] = None def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]: """ - Fetches the candidates to be tried for the current input. - - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids) - - Return: - `torch.LongTensor` of shape `(batch_size, candidate_length)` containing the candidate sequences to be - assessed by the model and a `torch.FloatTensor` of shape `(batch_size, candidate_length, - vocabulary_size)` containing the logits associated to each candidate. + Simplified version of get_candidates that uses the translator cache for token conversion. 
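        A minimal usage sketch (the variable names below are placeholders; candidate generators
        like this one are driven by `generate` rather than called directly, as in the
        different-tokenizer test that exercises this path):

            outputs = target_model.generate(
                input_ids,
                assistant_model=assistant_model,
                tokenizer=target_tokenizer,
                assistant_tokenizer=assistant_tokenizer,
            )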
""" - has_past_key_values = self.assistant_kwargs.get("past_key_values", None) is not None - - def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTensor: - nonlocal has_past_key_values - target_seq_len = target_input_ids.shape[-1] - target_new_ids = target_input_ids[:, -(target_seq_len - self._prev_target_seq_len) :] - self._prev_target_seq_len = target_seq_len - # Convert target_new_ids to string - target_new_toks = self.target_tokenizer.batch_decode( - target_new_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True - ) - # Convert the string to assistant_new_ids - assistant_new_ids = self.assistant_tokenizer( - target_new_toks, add_special_tokens=False, return_tensors="pt" - )["input_ids"] - if self._prev_assistant_ids is None: - self._prev_assistant_ids = assistant_new_ids - else: - self._prev_assistant_ids = torch.cat(self._prev_assistant_ids, assistant_new_ids, dim=-1) - return self._prev_assistant_ids.to(self.assistant_model.device) - + input_ids = input_ids.to(self.assistant_model.device) target_input_ids = input_ids.clone() - input_ids = get_assistant_input_ids(input_ids) + assistant_input_ids = self._prepare_assistant_input_ids(target_input_ids) - # Don't generate more than `max_length - 1` candidates since the target model generates one extra token. - new_cur_len = input_ids.shape[-1] - max_new_tokens = min(int(self.num_assistant_tokens), self.generation_config.max_length - new_cur_len - 1) - # TODO: Debug - # min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - new_cur_len), 0) + # Standard generation steps + min_new_tokens, max_new_tokens = self._calculate_new_tokens(assistant_input_ids) if max_new_tokens == 0: return input_ids, None - # 1. If it is not the first round of candidate generation, prepare the inputs based on the input_ids length - # (which implicitly contains the number of accepted candidates from the previous round) - # has_past_key_values = self.assistant_kwargs.get("past_key_values", None) is not None - if has_past_key_values: - new_cache_size = new_cur_len - 1 - self.assistant_kwargs["past_key_values"] = _crop_past_key_values( - self.assistant_model, self.assistant_kwargs["past_key_values"], new_cache_size - 1 - ) # the assistant does not have the token after the last match, hence the -1 - self.assistant_kwargs = _prepare_token_type_ids(self.assistant_kwargs, new_cur_len) - - # we need to update the attention mask to reflect the new input_ids length - self.assistant_kwargs = _prepare_attention_mask( - self.assistant_kwargs, new_cur_len, self.assistant_model.config.is_encoder_decoder - ) - - # 2. Forecast next N tokens using the assistant model. - assistant_generation_kwargs = { - self.input_ids_key: input_ids, - # "min_new_tokens": min_new_tokens, - # "max_new_tokens": max_new_tokens, - "min_new_tokens": 100, - "max_new_tokens": 100, - "generation_config": self.generation_config, - "logits_processor": self.logits_processor, - } + self._update_past_and_masks(assistant_input_ids) + generation_args = self._prepare_generation_args(assistant_input_ids, min_new_tokens, max_new_tokens) - assistant_output = self.assistant_model.generate( - **assistant_generation_kwargs, **self.assistant_kwargs, output_logits=True - ) + # Ensure scores are returned + generation_args["generation_config"].output_scores = True + generation_args["generation_config"].return_dict_in_generate = True - # 3. 
Update variables for the next round of candidate generation + # Generate and process outputs using translator + assistant_output = self.assistant_model.generate(**generation_args, **self.assistant_kwargs) self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values - # 4. Prepare variables for output - candidate_logits = torch.stack(assistant_output.logits, dim=1) - if not candidate_logits.shape[1] > 1: - msg = f"Since we set min_new_tokens to {assistant_generation_kwargs['min_new_tokens']} and max_new_tokens to {assistant_generation_kwargs['max_new_tokens']}, we expect at least 2 candidates, but seems like we got {candidate_logits.shape[1]} candidates." - raise Exception(msg) + candidate_logits = torch.stack(assistant_output.scores, dim=1) + if candidate_logits.shape[1] <= 1: + raise ValueError( + f"Expected at least 2 candidate tokens, but got {candidate_logits.shape[1]}. " + f"min_new_tokens: {generation_args['min_new_tokens']}, max_new_tokens: {generation_args['max_new_tokens']}." + ) + + # Use translator to convert tokens and logits candidate_ids = assistant_output.sequences candidate_logits = self._atm_translator.logits_processors(input_ids=candidate_ids, scores=candidate_logits) - target_ids = self._atm_translator.get_target_ids(input_ids, target_input_ids, candidate_ids) - + target_ids = self._atm_translator.get_target_ids(assistant_input_ids, target_input_ids, candidate_ids) target_logits = self._atm_translator.get_target_logits(candidate_logits) + return target_ids, target_logits + def _prepare_assistant_input_ids(self, target_input_ids: torch.LongTensor) -> torch.LongTensor: + """ + Simplified token conversion that only processes new tokens. + """ + # Calculate new tokens since last call + target_seq_len = target_input_ids.shape[-1] + new_token_count = target_seq_len - self._prev_target_seq_len + target_new_ids = target_input_ids[:, -new_token_count:] + self._prev_target_seq_len = target_seq_len + + # Convert only the new tokens + target_new_text = self.target_tokenizer.batch_decode( + target_new_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True + ) + assistant_new_ids = self.assistant_tokenizer( + target_new_text, add_special_tokens=False, return_tensors="pt" + )["input_ids"].to(self.assistant_model.device) + + # Update or initialize assistant IDs + if self._prev_assistant_ids is None: + self._prev_assistant_ids = assistant_new_ids + else: + self._prev_assistant_ids = torch.cat([self._prev_assistant_ids, assistant_new_ids], dim=-1) + + return self._prev_assistant_ids + class PromptLookupCandidateGenerator(CandidateGenerator): """ From 8aa6020a5fb01acb6536f8c2a66f82744e2e8930 Mon Sep 17 00:00:00 2001 From: jmamou Date: Tue, 26 Nov 2024 03:33:56 -0800 Subject: [PATCH 45/76] fix negative value for max_new_tokens --- src/transformers/generation/candidate_generator.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index b41719b6d8ec58..b2b4a4a28508d9 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -951,7 +951,7 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, # Standard generation steps min_new_tokens, max_new_tokens = self._calculate_new_tokens(assistant_input_ids) - if max_new_tokens == 0: + if max_new_tokens <= 0: return input_ids, None self._update_past_and_masks(assistant_input_ids) @@ -966,11 +966,6 @@ def 
get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values candidate_logits = torch.stack(assistant_output.scores, dim=1) - if candidate_logits.shape[1] <= 1: - raise ValueError( - f"Expected at least 2 candidate tokens, but got {candidate_logits.shape[1]}. " - f"min_new_tokens: {generation_args['min_new_tokens']}, max_new_tokens: {generation_args['max_new_tokens']}." - ) # Use translator to convert tokens and logits candidate_ids = assistant_output.sequences From 2169973d3d6981aa3f9cb3a9a0480366303672a2 Mon Sep 17 00:00:00 2001 From: jmamou Date: Tue, 26 Nov 2024 04:49:08 -0800 Subject: [PATCH 46/76] fix generation length target + attention_mask vs. assistant + attent --- src/transformers/generation/candidate_generator.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index b2b4a4a28508d9..f51c12c223b123 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -940,6 +940,8 @@ def __init__( # Track sequence lengths and previous assistant IDs self._prev_target_seq_len: int = 0 self._prev_assistant_ids: Optional[torch.LongTensor] = None + # generation max length according to the assistant vocabulary + self.assistant_generation_max_length = -1 def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]: """ @@ -948,14 +950,21 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, input_ids = input_ids.to(self.assistant_model.device) target_input_ids = input_ids.clone() assistant_input_ids = self._prepare_assistant_input_ids(target_input_ids) + if self.assistant_generation_max_length == -1: + self.assistant_generation_max_length = self.generation_config.max_length - input_ids.shape[1] + assistant_input_ids.shape[1] # Standard generation steps + target_generation_max_length = self.generation_config.max_length + self.generation_config.max_length = self.assistant_generation_max_length min_new_tokens, max_new_tokens = self._calculate_new_tokens(assistant_input_ids) - if max_new_tokens <= 0: + self.generation_config.max_length = target_generation_max_length + + if max_new_tokens == 0: return input_ids, None self._update_past_and_masks(assistant_input_ids) generation_args = self._prepare_generation_args(assistant_input_ids, min_new_tokens, max_new_tokens) + self.assistant_kwargs.pop("attention_mask", None) # Ensure scores are returned generation_args["generation_config"].output_scores = True From c6da82736f75529458624c9c5a1e6e054238a8cc Mon Sep 17 00:00:00 2001 From: jmamou Date: Tue, 26 Nov 2024 04:49:27 -0800 Subject: [PATCH 47/76] fix device --- src/transformers/generation/logits_process.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/generation/logits_process.py b/src/transformers/generation/logits_process.py index 39a38f9139ec1b..5fcd35c921af86 100644 --- a/src/transformers/generation/logits_process.py +++ b/src/transformers/generation/logits_process.py @@ -1866,7 +1866,7 @@ def __init__(self, suppress_tokens, device: str = "cpu"): @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING) def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: vocab_tensor = torch.arange(scores.shape[-1], device=scores.device) - suppress_token_mask = 
isin_mps_friendly(vocab_tensor, self.suppress_tokens) + suppress_token_mask = isin_mps_friendly(vocab_tensor, self.suppress_tokens.to(scores.device)) scores = torch.where(suppress_token_mask, -float("inf"), scores) return scores From 2cf9e8e5677ad2c18e83f694d8bbf0d6088f7ebd Mon Sep 17 00:00:00 2001 From: jmamou Date: Wed, 27 Nov 2024 04:28:24 -0800 Subject: [PATCH 48/76] fix negative max_new_tokens bug --- src/transformers/generation/candidate_generator.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index f51c12c223b123..61bb3b7f3240fc 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -940,8 +940,6 @@ def __init__( # Track sequence lengths and previous assistant IDs self._prev_target_seq_len: int = 0 self._prev_assistant_ids: Optional[torch.LongTensor] = None - # generation max length according to the assistant vocabulary - self.assistant_generation_max_length = -1 def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]: """ @@ -950,14 +948,7 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, input_ids = input_ids.to(self.assistant_model.device) target_input_ids = input_ids.clone() assistant_input_ids = self._prepare_assistant_input_ids(target_input_ids) - if self.assistant_generation_max_length == -1: - self.assistant_generation_max_length = self.generation_config.max_length - input_ids.shape[1] + assistant_input_ids.shape[1] - - # Standard generation steps - target_generation_max_length = self.generation_config.max_length - self.generation_config.max_length = self.assistant_generation_max_length - min_new_tokens, max_new_tokens = self._calculate_new_tokens(assistant_input_ids) - self.generation_config.max_length = target_generation_max_length + min_new_tokens, max_new_tokens = self._calculate_new_tokens(target_input_ids) if max_new_tokens == 0: return input_ids, None From a1c0d051c3d0336429aed8a2495143275fa7b871 Mon Sep 17 00:00:00 2001 From: jmamou Date: Thu, 28 Nov 2024 06:13:57 -0800 Subject: [PATCH 49/76] fix UAG --- src/transformers/generation/candidate_generator.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 61bb3b7f3240fc..2bed5d7533dbdf 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -455,15 +455,17 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, assessed by the model and a `torch.FloatTensor` of shape `(batch_size, candidate_length, vocabulary_size)` containing the logits associated to each candidate. 
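        For illustration (`candidate_generator` is a hypothetical instance name): when
        `self.num_assistant_tokens` is 0, no drafting is attempted and the call reduces to

            candidate_ids, candidate_logits = candidate_generator.get_candidates(input_ids)
            # candidate_ids is the unmodified input_ids and candidate_logits is None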
""" + max_new_tokens = int(self.num_assistant_tokens) + if max_new_tokens == 0: # TODO + return input_ids, None + input_ids = input_ids.to(self.assistant_model.device) remove_from_pkv = 0 assistant_input_ids, remove_from_pkv = self._prepare_assistant_input_ids(input_ids) self.prev_assistant_ids = assistant_input_ids - min_new_tokens, max_new_tokens = self._calculate_new_tokens(assistant_input_ids) - if max_new_tokens == 0: - return input_ids, None + min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - assistant_input_ids.shape[-1]), 0) input_ids = input_ids.to(self.assistant_model.device) remove_from_pkv = 0 From d8300913641218c628f0a1c3ba61b2ed09673a01 Mon Sep 17 00:00:00 2001 From: jmamou Date: Thu, 28 Nov 2024 06:17:03 -0800 Subject: [PATCH 50/76] minor --- src/transformers/generation/candidate_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 2bed5d7533dbdf..c044b7913a1ec5 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -456,7 +456,7 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, vocabulary_size)` containing the logits associated to each candidate. """ max_new_tokens = int(self.num_assistant_tokens) - if max_new_tokens == 0: # TODO + if max_new_tokens == 0: return input_ids, None input_ids = input_ids.to(self.assistant_model.device) From 19d0cceda7777a6074962a26645bc17b8245fee3 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Thu, 28 Nov 2024 16:10:28 -0500 Subject: [PATCH 51/76] formatting --- src/transformers/generation/candidate_generator.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index c044b7913a1ec5..3435310b255757 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -458,7 +458,7 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, max_new_tokens = int(self.num_assistant_tokens) if max_new_tokens == 0: return input_ids, None - + input_ids = input_ids.to(self.assistant_model.device) remove_from_pkv = 0 @@ -991,9 +991,9 @@ def _prepare_assistant_input_ids(self, target_input_ids: torch.LongTensor) -> to target_new_text = self.target_tokenizer.batch_decode( target_new_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True ) - assistant_new_ids = self.assistant_tokenizer( - target_new_text, add_special_tokens=False, return_tensors="pt" - )["input_ids"].to(self.assistant_model.device) + assistant_new_ids = self.assistant_tokenizer(target_new_text, add_special_tokens=False, return_tensors="pt")[ + "input_ids" + ].to(self.assistant_model.device) # Update or initialize assistant IDs if self._prev_assistant_ids is None: From 5b8217d17bfeb107884e822f01553d58928458ad Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Thu, 28 Nov 2024 16:14:07 -0500 Subject: [PATCH 52/76] `AssistedCandidateGeneratorDifferentTokenizers` `lookbehind`s init --- src/transformers/generation/candidate_generator.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 3435310b255757..5f1131a725509f 100644 --- a/src/transformers/generation/candidate_generator.py +++ 
b/src/transformers/generation/candidate_generator.py @@ -17,8 +17,6 @@ import threading import weakref from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple -import weakref -from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple import numpy as np import torch @@ -329,7 +327,6 @@ def __init__( self.target_tokenizer = target_tokenizer self.assistant_tokenizer = assistant_tokenizer self.prev_target_ids = None - self.prev_tokens = None self.prev_assistant_ids = None self.target_lookbehind = assistant_model.generation_config.target_lookbehind self.assistant_lookbehind = assistant_model.generation_config.assistant_lookbehind @@ -485,7 +482,7 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, # Update state self.prev_target_ids = input_ids self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values - self.prev_tokens = assistant_output.sequences + self.prev_assistant_ids = assistant_output.sequences if input_ids.shape[1] >= new_target_ids.shape[1]: return input_ids, None @@ -500,7 +497,7 @@ def _prepare_assistant_input_ids(self, input_ids: torch.LongTensor) -> Tuple[tor } remove_from_pkv = 0 - if self.prev_tokens is not None and self.prev_target_ids.shape[1] > self.target_lookbehind: + if self.prev_assistant_ids is not None and self.prev_target_ids.shape[1] > self.target_lookbehind: # input_ids contains all target prompt input ids and some new target input ids start_index_in_target_window = self.prev_target_ids.shape[1] - self.target_lookbehind From 9b0126a8efc4dd5b1be0564e7a8f32ae149bc472 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sat, 30 Nov 2024 13:18:48 -0500 Subject: [PATCH 53/76] resolve conflict & formatting --- tests/generation/test_candidate_generator.py | 45 +++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/tests/generation/test_candidate_generator.py b/tests/generation/test_candidate_generator.py index 1443a99634c168..260f92109b7bb2 100644 --- a/tests/generation/test_candidate_generator.py +++ b/tests/generation/test_candidate_generator.py @@ -4,9 +4,52 @@ import weakref from unittest.mock import MagicMock +import numpy as np import torch -from src.transformers.generation.candidate_generator import AssistantToTargetTranslator, AssistantVocabTranslatorCache +from transformers.generation.candidate_generator import ( + AssistantToTargetTranslator, + AssistantVocabTranslatorCache, + AssistedCandidateGeneratorDifferentTokenizers, +) + + +class TestAssistedCandidateGeneratorDifferentTokenizers(unittest.TestCase): + def test_no_intersection(self): + prompt = np.array([[1, 2, 3]]) + prompt_plus_new_tokens = np.array([[4, 5, 6]]) + result = AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag(prompt, prompt_plus_new_tokens) + self.assertEqual(result, (None, None, None)) + + def test_complete_overlap(self): + prompt = np.array([[1, 2, 3]]) + prompt_plus_new_tokens = np.array([[1, 2, 3, 4, 5]]) + discrep_length, new_tokens_only, discrep_only = AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag( + prompt, prompt_plus_new_tokens + ) + self.assertEqual(discrep_length, 0) + np.testing.assert_array_equal(new_tokens_only, np.array([[4, 5]])) + np.testing.assert_array_equal(discrep_only, np.array([[]])) + + def test_partial_overlap(self): + prompt = np.array([[1, 2, 3]]) + prompt_plus_new_tokens = np.array([[2, 3, 4, 5]]) + discrep_length, new_tokens_only, discrep_only = AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag( + prompt, prompt_plus_new_tokens + ) + 
self.assertEqual(discrep_length, 0) + np.testing.assert_array_equal(new_tokens_only, np.array([[4, 5]])) + np.testing.assert_array_equal(discrep_only, np.array([[]])) + + def test_no_new_tokens(self): + prompt = np.array([[1, 2, 3]]) + prompt_plus_new_tokens = np.array([[1, 2, 3]]) + discrep_length, new_tokens_only, discrep_only = AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag( + prompt, prompt_plus_new_tokens + ) + self.assertEqual(discrep_length, 0) + np.testing.assert_array_equal(new_tokens_only, np.array([[]])) + np.testing.assert_array_equal(discrep_only, np.array([[]])) class TestAssistantToTargetTranslator(unittest.TestCase): From 578d0b348aea95cbeb95b5f89d728f6e1eda1535 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sat, 30 Nov 2024 13:57:28 -0500 Subject: [PATCH 54/76] rerun CI tests --- tests/generation/test_candidate_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/generation/test_candidate_generator.py b/tests/generation/test_candidate_generator.py index 260f92109b7bb2..798e53de53ca75 100644 --- a/tests/generation/test_candidate_generator.py +++ b/tests/generation/test_candidate_generator.py @@ -12,7 +12,7 @@ AssistantVocabTranslatorCache, AssistedCandidateGeneratorDifferentTokenizers, ) - + class TestAssistedCandidateGeneratorDifferentTokenizers(unittest.TestCase): def test_no_intersection(self): From 7db269547a01085845cf2cec3d62a3a7e26b9675 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sat, 30 Nov 2024 13:57:46 -0500 Subject: [PATCH 55/76] remove space... --- tests/generation/test_candidate_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/generation/test_candidate_generator.py b/tests/generation/test_candidate_generator.py index 798e53de53ca75..260f92109b7bb2 100644 --- a/tests/generation/test_candidate_generator.py +++ b/tests/generation/test_candidate_generator.py @@ -12,7 +12,7 @@ AssistantVocabTranslatorCache, AssistedCandidateGeneratorDifferentTokenizers, ) - + class TestAssistedCandidateGeneratorDifferentTokenizers(unittest.TestCase): def test_no_intersection(self): From fb699001e14ed188e3e9661265dc2320ad60116a Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Mon, 2 Dec 2024 20:30:36 -0500 Subject: [PATCH 56/76] remove old code --- .../generation/candidate_generator.py | 229 ------------------ 1 file changed, 229 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 5f1131a725509f..25148b3a6b837c 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -464,14 +464,6 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - assistant_input_ids.shape[-1]), 0) - input_ids = input_ids.to(self.assistant_model.device) - remove_from_pkv = 0 - - assistant_input_ids, remove_from_pkv = self._prepare_assistant_input_ids(input_ids) - self.prev_assistant_ids = assistant_input_ids - - min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - assistant_input_ids.shape[-1]), 0) - self._update_past_and_masks(assistant_input_ids, remove_from_pkv) generation_args = self._prepare_generation_args(assistant_input_ids, min_new_tokens, max_new_tokens) self.assistant_kwargs.pop("attention_mask", None) @@ -568,227 +560,6 @@ def _process_assistant_outputs( return new_target_ids -class AssistantToTargetTranslator: - """ - Translate the assistant into 
the target universe. - """ - - def __init__(self, target_tokenizer: "PreTrainedTokenizerBase", assistant_tokenizer: "PreTrainedTokenizerBase"): - self._target_tokenizer: "PreTrainedTokenizerBase" = target_tokenizer - self._assistant_tokenizer: "PreTrainedTokenizerBase" = assistant_tokenizer - self._assistant_to_target_input_ids: dict[int, int] = self._get_assistant_to_target_input_ids() - self.suppress_input_ids: list[int] = self._get_suppress_input_ids() - self.logits_processors: LogitsProcessorList = LogitsProcessorList( - [ - SuppressTokensLogitsProcessor(self.suppress_input_ids), - LogitNormalization(), - ] - ) - - def _get_assistant_to_target_input_ids(self) -> dict[int, int]: - """ - Get a mapping from assistant tokens to target tokens based on vocabularies. - """ - target_vocab = self._target_tokenizer.get_vocab() - assistant_vocab = self._assistant_tokenizer.get_vocab() - return { - assistant_vocab[tok]: target_vocab[tok] for tok in set(target_vocab.keys()) & set(assistant_vocab.keys()) - } - - def _get_suppress_input_ids(self) -> list[int]: - """ - Get the input ids that are in the assistant vocab but not in the target vocab. - """ - assistant_vocab = self._assistant_tokenizer.get_vocab() - return list(set(assistant_vocab.values()) - set(self._assistant_to_target_input_ids.keys())) - - def get_target_ids(self, assistant_input_ids, target_input_ids, assistant_candidate_ids: torch.LongTensor) -> torch.LongTensor: - """ - Return the target candidate ids that correspond to the assistant candidate ids. - Note that we have already the target ids for the prompt and we only need to find the target ids for the new tokens. - Moreover, assistant ids of the original prompt does not necessarily appear in _assistant_to_target_input_ids. - """ - target_candidate_ids = assistant_candidate_ids[0, -(len(assistant_candidate_ids[0]) - assistant_input_ids.shape[1]) :].apply_( - lambda x: self._assistant_to_target_input_ids.get(x, x) - ) - return torch.cat((target_input_ids, target_candidate_ids.unsqueeze(0)), dim=1) - - def get_target_logits(self, assistant_logits: torch.FloatTensor) -> torch.FloatTensor: - """ - Return the target logits that correspond to the assistant logits. - """ - target_vocab_size: int = len(self._target_tokenizer.get_vocab()) - target_shape: tuple[int, ...] 
= (*assistant_logits.shape[:-1], target_vocab_size) - target_logits: torch.FloatTensor = torch.full(target_shape, -float("inf")) - assistant_logits_supported_mask: torch.BoolTensor = assistant_logits > -float("inf") - assistant_logits_supported_indices: torch.IntTensor = assistant_logits_supported_mask.nonzero(as_tuple=True)[ - -1 - ] - target_logits_supported_indices: torch.IntTensor = assistant_logits_supported_indices.apply_( - lambda x: self._assistant_to_target_input_ids[x] - ) - target_logits[..., target_logits_supported_indices] = assistant_logits[..., assistant_logits_supported_mask] - return target_logits - - -class AssistantVocabTranslatorCache: - _lock = threading.Lock() - _cache = weakref.WeakKeyDictionary() - - @classmethod - def get_translator( - cls, target_tokenizer: "PreTrainedTokenizerBase", assistant_tokenizer: "PreTrainedTokenizerBase" - ) -> AssistantToTargetTranslator: - with cls._lock: - assistant_dict = cls._cache.get(target_tokenizer) - if assistant_dict is None: - assistant_dict = weakref.WeakKeyDictionary() - cls._cache[target_tokenizer] = assistant_dict - - mapping = assistant_dict.get(assistant_tokenizer) - if mapping is None: - mapping = AssistantToTargetTranslator(target_tokenizer, assistant_tokenizer) - assistant_dict[assistant_tokenizer] = mapping - - return mapping - - -class UniversalSpeculativeDecodingGenerator(AssistedCandidateGeneratorDifferentTokenizers): - """ - `CandidateGenerator` class to be used for Universal Speculative Decoding (USD): speculative decoding with different tokenizers - for the assistant and main models. This class generates candidates through the use of a smaller - model. - - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids) - assistant_model (`PreTrainedModel`): - The model to be used for generating candidates. This model should be smaller than the main model. - target_tokenizer (`PreTrainedTokenizerBase`): - The tokenizer used for the target model. - assistant_tokenizer (`PreTrainedTokenizerBase`): - The tokenizer used for the assistant model. - generation_config (`~generation.GenerationConfig`, *optional*): - The generation configuration to be used as base parametrization for the generation call. - logits_processor (`LogitsProcessorList`): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] - used to modify the prediction scores of the language modeling head applied at each generation step. - model_kwargs (`Dict`): - The keyword arguments that will be passed to the main model, and are used as base inputs for the assistant - model as well. - inputs_tensor (`torch.Tensor`, *optional*): - The model input tensor. In encoder-decoder models, this is the encoder input. 
- """ - - def __init__( - self, - input_ids: torch.LongTensor, - assistant_model: "PreTrainedModel", - target_tokenizer: "PreTrainedTokenizerBase", - assistant_tokenizer: "PreTrainedTokenizerBase", - generation_config: "GenerationConfig", - model_kwargs: Dict, - inputs_tensor: Optional[torch.Tensor] = None, - logits_processor: "LogitsProcessorList" = None, - ): - self._atm_translator = AssistantVocabTranslatorCache.get_translator(target_tokenizer, assistant_tokenizer) - super().__init__( - input_ids, - assistant_model, - target_tokenizer, - assistant_tokenizer, - generation_config, - model_kwargs, - inputs_tensor, - logits_processor, - ) - self._prev_target_seq_len: int = 0 - self._prev_assistant_ids: torch.LongTensor | None = None - - def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]: - """ - Fetches the candidates to be tried for the current input. - - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids) - - Return: - `torch.LongTensor` of shape `(batch_size, candidate_length)` containing the candidate sequences to be - assessed by the model and a `torch.FloatTensor` of shape `(batch_size, candidate_length, - vocabulary_size)` containing the logits associated to each candidate. - """ - has_past_key_values = self.assistant_kwargs.get("past_key_values", None) is not None - - def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTensor: - nonlocal has_past_key_values - target_seq_len = target_input_ids.shape[-1] - target_new_ids = target_input_ids[:, -(target_seq_len - self._prev_target_seq_len) :] - self._prev_target_seq_len = target_seq_len - # Convert target_new_ids to string - target_new_toks = self.target_tokenizer.batch_decode(target_new_ids, skip_special_tokens=False) - # Convert the string to assistant_new_ids - assistant_new_ids = self.assistant_tokenizer.encode(target_new_toks[0], add_special_tokens=False) - if self._prev_assistant_ids is None: - self._prev_assistant_ids = assistant_new_ids - else: - self._prev_assistant_ids = self._prev_assistant_ids + assistant_new_ids - return torch.tensor(self._prev_assistant_ids).unsqueeze(0).to(self.assistant_model.device) - - target_input_ids = input_ids.clone() - input_ids = get_assistant_input_ids(input_ids) - - # Don't generate more than `max_length - 1` candidates since the target model generates one extra token. - new_cur_len = input_ids.shape[-1] - max_new_tokens = min(int(self.num_assistant_tokens), self.generation_config.max_length - new_cur_len - 1) - min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - new_cur_len), 0) - if max_new_tokens == 0: - return input_ids, None - - # 1. 
If it is not the first round of candidate generation, prepare the inputs based on the input_ids length - # (which implicitly contains the number of accepted candidates from the previous round) - # has_past_key_values = self.assistant_kwargs.get("past_key_values", None) is not None - if has_past_key_values: - new_cache_size = new_cur_len - 1 - self.assistant_kwargs["past_key_values"] = _crop_past_key_values( - self.assistant_model, self.assistant_kwargs["past_key_values"], new_cache_size - 1 - ) # the assistant does not have the token after the last match, hence the -1 - self.assistant_kwargs = _prepare_token_type_ids(self.assistant_kwargs, new_cur_len) - - # we need to update the attention mask to reflect the new input_ids length - self.assistant_kwargs = _prepare_attention_mask( - self.assistant_kwargs, new_cur_len, self.assistant_model.config.is_encoder_decoder - ) - - # 2. Forecast next N tokens using the assistant model. - assistant_generation_kwargs = { - self.input_ids_key: input_ids, - # "min_new_tokens": min_new_tokens, - # "max_new_tokens": max_new_tokens, - "min_new_tokens": 100, - "max_new_tokens": 100, - "generation_config": self.generation_config, - "logits_processor": self.logits_processor, - } - - assistant_output = self.assistant_model.generate(**assistant_generation_kwargs, **self.assistant_kwargs, output_logits=True) - - # 3. Update variables for the next round of candidate generation - self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values - - # 4. Prepare variables for output - candidate_logits = torch.stack(assistant_output.logits, dim=1) - if not candidate_logits.shape[1] > 1: - msg = f"Since we set min_new_tokens to {assistant_generation_kwargs['min_new_tokens']} and max_new_tokens to {assistant_generation_kwargs['max_new_tokens']}, we expect at least 2 candidates, but seems like we got {candidate_logits.shape[1]} candidates." - raise Exception(msg) - candidate_ids = assistant_output.sequences - candidate_logits = self._atm_translator.logits_processors(input_ids=candidate_ids, scores=candidate_logits) - target_ids = self._atm_translator.get_target_ids(input_ids, target_input_ids, candidate_ids) - - target_logits = self._atm_translator.get_target_logits(candidate_logits) - return target_ids, target_logits - - class AssistantToTargetTranslator: """ Translate the assistant into the target universe. From e40c775c8b5671e17357872221b71c751e98f995 Mon Sep 17 00:00:00 2001 From: jmamou Date: Wed, 4 Dec 2024 05:04:38 -0800 Subject: [PATCH 57/76] fix candidate_input_ids device --- src/transformers/generation/utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 07c743c6b79f3d..77535d18464ee2 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -4280,7 +4280,8 @@ def _assisted_decoding( # 1. Fetch candidate sequences from a `CandidateGenerator` candidate_input_ids, candidate_logits = candidate_generator.get_candidates(input_ids) - + candidate_input_ids = candidate_input_ids.to(self.device) + if candidate_logits is not None: candidate_logits = candidate_logits.to(self.device) @@ -4482,9 +4483,12 @@ def _speculative_sampling( # Gets the probabilities from the logits. q_i and p_i denote the assistant and model probabilities of the tokens # selected by the assistant, respectively. 
q = candidate_logits.softmax(dim=-1) + device = new_candidate_input_ids.device + #new_candidate_input_ids = new_candidate_input_ids.to(q.device) q_i = q[:, torch.arange(candidate_length), new_candidate_input_ids].squeeze(0, 1) p = new_logits.softmax(dim=-1) p_i = p[:, torch.arange(candidate_length), new_candidate_input_ids].squeeze(0, 1) + #new_candidate_input_ids = new_candidate_input_ids.to(device) probability_ratio = p_i / q_i # When probability_ratio > 1 (i.e. q_i(x) < p_i(x), or "assistant probability of the candidate token is smaller From b5ce873fb930c83cdaeb85fa7fbded0dfb06c1bd Mon Sep 17 00:00:00 2001 From: jmamou Date: Wed, 4 Dec 2024 05:10:06 -0800 Subject: [PATCH 58/76] minor --- src/transformers/generation/utils.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 77535d18464ee2..90c23fe7f08dff 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -4483,12 +4483,9 @@ def _speculative_sampling( # Gets the probabilities from the logits. q_i and p_i denote the assistant and model probabilities of the tokens # selected by the assistant, respectively. q = candidate_logits.softmax(dim=-1) - device = new_candidate_input_ids.device - #new_candidate_input_ids = new_candidate_input_ids.to(q.device) q_i = q[:, torch.arange(candidate_length), new_candidate_input_ids].squeeze(0, 1) p = new_logits.softmax(dim=-1) p_i = p[:, torch.arange(candidate_length), new_candidate_input_ids].squeeze(0, 1) - #new_candidate_input_ids = new_candidate_input_ids.to(device) probability_ratio = p_i / q_i # When probability_ratio > 1 (i.e. q_i(x) < p_i(x), or "assistant probability of the candidate token is smaller From d34d7eaa420b42ffc501e9fc2eb4c31c52d023f1 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Wed, 4 Dec 2024 20:33:32 -0500 Subject: [PATCH 59/76] formatting --- src/transformers/generation/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 90c23fe7f08dff..7a9d78168ac903 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -4281,7 +4281,7 @@ def _assisted_decoding( # 1. 
Fetch candidate sequences from a `CandidateGenerator` candidate_input_ids, candidate_logits = candidate_generator.get_candidates(input_ids) candidate_input_ids = candidate_input_ids.to(self.device) - + if candidate_logits is not None: candidate_logits = candidate_logits.to(self.device) From 9d4d9f9a8cd53b2cdf0fae22846bc07012b29507 Mon Sep 17 00:00:00 2001 From: Jonathan Mamou Date: Wed, 18 Dec 2024 00:52:10 +0200 Subject: [PATCH 60/76] Fix prepare + apply (#7) * fix prepare + apply * move to cpu * simplity suppress_tokens * fix bugs and refacatoring * device move * handle self.config.vocab_size > len(target_tokenizer.get_vocab()) * no need to normalize in candidate_generator * address Nadav's comments + minor * optimize device move + SuppressTokensLogitsProcessor * AssistantToTargetTranslator, SuppressTokensLogitsProcessor and tokenizers mapping improvements * padding size * padding improvement * fix and simplify get_target_logits * renaming in get_target_logits * minor * add filter_value and suppress_tokens_id * style + rename * remove TODO * restore original SelectTokensLogitsProcessor with modification * fix style * fix _update_past_and_masks and optimize code * remove assistant_vocab_size arg * fix attention_mask * call _prepare_attention_mask also if not has_past_key_values * handling attention mask for first generation * comment * restore test * remove SelectTokensLogitsProcessor * _update_past_and_masks implementation for USD --- .../generation/candidate_generator.py | 158 +++++++++++------- src/transformers/generation/logits_process.py | 7 +- src/transformers/generation/utils.py | 2 + tests/generation/test_candidate_generator.py | 2 +- 4 files changed, 100 insertions(+), 69 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 25148b3a6b837c..a37481018086d0 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -24,7 +24,6 @@ from ..cache_utils import DynamicCache from ..pytorch_utils import isin_mps_friendly from .logits_process import ( - LogitNormalization, LogitsProcessorList, MinLengthLogitsProcessor, SuppressTokensLogitsProcessor, @@ -245,18 +244,21 @@ def _calculate_new_tokens(self, input_ids: torch.LongTensor) -> Tuple[int, int]: min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - new_cur_len), 0) return min_new_tokens, max_new_tokens - def _update_past_and_masks(self, input_ids: torch.LongTensor, remove_from_pkv: int = 0) -> bool: + def _update_past_and_masks( + self, input_ids: torch.LongTensor, remove_from_pkv: int = 0, num_added_tokens: int = 1 + ) -> bool: """Update past key values and attention masks for subsequent generation rounds.""" has_past_key_values = self.assistant_kwargs.get("past_key_values", None) is not None if has_past_key_values: new_cache_size = input_ids.shape[-1] - 1 - remove_from_pkv self.assistant_kwargs["past_key_values"] = _crop_past_key_values( - self.assistant_model, self.assistant_kwargs["past_key_values"], new_cache_size - 1 + self.assistant_model, self.assistant_kwargs["past_key_values"], new_cache_size - num_added_tokens ) self.assistant_kwargs = _prepare_attention_mask( self.assistant_kwargs, input_ids.shape[-1], self.assistant_model.config.is_encoder_decoder ) self.assistant_kwargs = _prepare_token_type_ids(self.assistant_kwargs, input_ids.shape[-1]) + return has_past_key_values def _prepare_generation_args(self, input_ids: torch.LongTensor, min_new_tokens: int, 
max_new_tokens: int) -> Dict: @@ -565,34 +567,41 @@ class AssistantToTargetTranslator: Translate the assistant into the target universe. """ - def __init__(self, target_tokenizer: "PreTrainedTokenizerBase", assistant_tokenizer: "PreTrainedTokenizerBase"): + def __init__( + self, + target_tokenizer: "PreTrainedTokenizerBase", + assistant_tokenizer: "PreTrainedTokenizerBase", + assistant_model_device, + target_vocab_size: int, + filter_value: float = -float("Inf"), + suppress_tokens_id: int = -1, + ): self._target_tokenizer: "PreTrainedTokenizerBase" = target_tokenizer self._assistant_tokenizer: "PreTrainedTokenizerBase" = assistant_tokenizer - self._assistant_to_target_input_ids: dict[int, int] = self._get_assistant_to_target_input_ids() - self.suppress_input_ids: list[int] = self._get_suppress_input_ids() + self._assistant_model_device = assistant_model_device + self.target_vocab_size: int = target_vocab_size + self.filter_value: float = filter_value + self.suppress_tokens_id: int = suppress_tokens_id + self._assistant_to_target_input_ids = self._get_assistant_to_target_input_ids() self.logits_processors: LogitsProcessorList = LogitsProcessorList( - [ - SuppressTokensLogitsProcessor(self.suppress_input_ids), - LogitNormalization(), - ] + [SuppressTokensLogitsProcessor(self._get_suppress_input_ids(), self._assistant_model_device)] ) - def _get_assistant_to_target_input_ids(self) -> dict[int, int]: - """ - Get a mapping from assistant tokens to target tokens based on vocabularies. - """ + def _get_assistant_to_target_input_ids(self): target_vocab = self._target_tokenizer.get_vocab() assistant_vocab = self._assistant_tokenizer.get_vocab() - return { - assistant_vocab[tok]: target_vocab[tok] for tok in set(target_vocab.keys()) & set(assistant_vocab.keys()) - } + max_assistant_index = max(assistant_vocab.values()) + assistant_to_target_input_ids = torch.full((max_assistant_index + 1,), self.suppress_tokens_id, dtype=int) + for tok, idx in assistant_vocab.items(): + if tok in target_vocab: + assistant_to_target_input_ids[idx] = target_vocab[tok] + return assistant_to_target_input_ids.to(self._assistant_model_device) def _get_suppress_input_ids(self) -> list[int]: """ Get the input ids that are in the assistant vocab but not in the target vocab. """ - assistant_vocab = self._assistant_tokenizer.get_vocab() - return list(set(assistant_vocab.values()) - set(self._assistant_to_target_input_ids.keys())) + return torch.where(self._assistant_to_target_input_ids == self.suppress_tokens_id)[0] def get_target_ids( self, assistant_input_ids, target_input_ids, assistant_candidate_ids: torch.LongTensor @@ -602,33 +611,29 @@ def get_target_ids( Note that we have already the target ids for the prompt and we only need to find the target ids for the new tokens. Moreover, assistant ids of the original prompt does not necessarily appear in _assistant_to_target_input_ids. 
""" - device = assistant_candidate_ids.device - target_candidate_ids = ( - assistant_candidate_ids[0, -(len(assistant_candidate_ids[0]) - assistant_input_ids.shape[1]) :] - .cpu() - .apply_(lambda x: self._assistant_to_target_input_ids.get(x, x)) - .to(device) - ) - return torch.cat((target_input_ids, target_candidate_ids.unsqueeze(0)), dim=1) + + num_new_tokens = len(assistant_candidate_ids[0]) - assistant_input_ids.shape[1] + if num_new_tokens == 0: + return target_input_ids + else: + transformed_slice = self._assistant_to_target_input_ids[assistant_candidate_ids[0, -num_new_tokens:]] + return torch.cat((target_input_ids, transformed_slice.unsqueeze(0)), dim=1) def get_target_logits(self, assistant_logits: torch.FloatTensor) -> torch.FloatTensor: """ Return the target logits that correspond to the assistant logits. """ - device = assistant_logits.device - target_vocab_size: int = len(self._target_tokenizer.get_vocab()) - target_shape: tuple[int, ...] = (*assistant_logits.shape[:-1], target_vocab_size) - target_logits: torch.FloatTensor = torch.full(target_shape, -float("inf")).to(device) - assistant_logits_supported_mask: torch.BoolTensor = assistant_logits > -float("inf") - assistant_logits_supported_indices: torch.IntTensor = assistant_logits_supported_mask.nonzero(as_tuple=True)[ - -1 - ] - target_logits_supported_indices: torch.IntTensor = ( - assistant_logits_supported_indices.cpu() - .apply_(lambda x: self._assistant_to_target_input_ids[x]) - .to(device) - ) - target_logits[..., target_logits_supported_indices] = assistant_logits[..., assistant_logits_supported_mask] + + target_shape: tuple[int, ...] = (*assistant_logits.shape[:-1], self.target_vocab_size) + target_logits: torch.FloatTensor = torch.full(target_shape, self.filter_value).to(self._assistant_model_device) + # Mask for valid indices + assistant_indices_mask = self._assistant_to_target_input_ids != self.suppress_tokens_id + # Exclude invalid indices + target_logits_supported_indices = self._assistant_to_target_input_ids[assistant_indices_mask] + valid_assistant_logits = assistant_logits[..., : self._assistant_to_target_input_ids.shape[0]] + + target_logits[..., target_logits_supported_indices] = valid_assistant_logits[..., assistant_indices_mask] + return target_logits @@ -643,7 +648,11 @@ class AssistantVocabTranslatorCache: @classmethod def get_translator( - cls, target_tokenizer: "PreTrainedTokenizerBase", assistant_tokenizer: "PreTrainedTokenizerBase" + cls, + target_tokenizer: "PreTrainedTokenizerBase", + assistant_tokenizer: "PreTrainedTokenizerBase", + assistant_model_device, + target_vocab_size: int, ) -> AssistantToTargetTranslator: with cls._lock: assistant_dict = cls._cache.get(target_tokenizer) @@ -653,7 +662,9 @@ def get_translator( mapping = assistant_dict.get(assistant_tokenizer) if mapping is None: - mapping = AssistantToTargetTranslator(target_tokenizer, assistant_tokenizer) + mapping = AssistantToTargetTranslator( + target_tokenizer, assistant_tokenizer, assistant_model_device, target_vocab_size + ) assistant_dict[assistant_tokenizer] = mapping return mapping @@ -692,11 +703,14 @@ def __init__( assistant_tokenizer: "PreTrainedTokenizerBase", generation_config: "GenerationConfig", model_kwargs: Dict, + target_vocab_size: int, inputs_tensor: Optional[torch.Tensor] = None, logits_processor: "LogitsProcessorList" = None, ): # Initialize translator before parent class - self._atm_translator = AssistantVocabTranslatorCache.get_translator(target_tokenizer, assistant_tokenizer) + self._atm_translator = 
AssistantVocabTranslatorCache.get_translator( + target_tokenizer, assistant_tokenizer, assistant_model.device, target_vocab_size + ) super().__init__( input_ids, assistant_model, @@ -708,42 +722,49 @@ def __init__( logits_processor, ) # Track sequence lengths and previous assistant IDs - self._prev_target_seq_len: int = 0 + self._target_seq_len_with_candidates: int = 0 self._prev_assistant_ids: Optional[torch.LongTensor] = None + self.target_vocab_size = target_vocab_size def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]: """ Simplified version of get_candidates that uses the translator cache for token conversion. """ - input_ids = input_ids.to(self.assistant_model.device) - target_input_ids = input_ids.clone() - assistant_input_ids = self._prepare_assistant_input_ids(target_input_ids) + target_input_ids = input_ids.to(self.assistant_model.device) + assistant_input_ids, num_added_tokens = self._prepare_assistant_input_ids(target_input_ids) min_new_tokens, max_new_tokens = self._calculate_new_tokens(target_input_ids) if max_new_tokens == 0: return input_ids, None - self._update_past_and_masks(assistant_input_ids) + self._update_past_and_masks(assistant_input_ids, num_added_tokens=num_added_tokens) generation_args = self._prepare_generation_args(assistant_input_ids, min_new_tokens, max_new_tokens) - self.assistant_kwargs.pop("attention_mask", None) # Ensure scores are returned generation_args["generation_config"].output_scores = True generation_args["generation_config"].return_dict_in_generate = True # Generate and process outputs using translator - assistant_output = self.assistant_model.generate(**generation_args, **self.assistant_kwargs) - self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values - - candidate_logits = torch.stack(assistant_output.scores, dim=1) + generation_args["logits_processor"] = self._atm_translator.logits_processors + self._prev_assistant_ids, assistant_candidate_logits = self._generate_candidates(generation_args) # Use translator to convert tokens and logits - candidate_ids = assistant_output.sequences - candidate_logits = self._atm_translator.logits_processors(input_ids=candidate_ids, scores=candidate_logits) - target_ids = self._atm_translator.get_target_ids(assistant_input_ids, target_input_ids, candidate_ids) - target_logits = self._atm_translator.get_target_logits(candidate_logits) + target_candidate_ids = self._atm_translator.get_target_ids( + assistant_input_ids, target_input_ids, self._prev_assistant_ids + ) + self._target_seq_len_with_candidates = target_candidate_ids.shape[-1] + target_candidate_logits = self._atm_translator.get_target_logits(assistant_candidate_logits) - return target_ids, target_logits + return target_candidate_ids, target_candidate_logits + + def _update_past_and_masks(self, assistant_input_ids: torch.LongTensor, num_added_tokens: int = 1) -> bool: + if self._prev_assistant_ids is None: + # Prepare attention mask for the first generation. + # For subsequent generations, the attention mask is updated in super()_update_past_and_masks. 
+ self.assistant_kwargs = _prepare_attention_mask( + self.assistant_kwargs, assistant_input_ids.shape[-1], self.assistant_model.config.is_encoder_decoder + ) + return super()._update_past_and_masks(assistant_input_ids, num_added_tokens=num_added_tokens) def _prepare_assistant_input_ids(self, target_input_ids: torch.LongTensor) -> torch.LongTensor: """ @@ -751,9 +772,11 @@ def _prepare_assistant_input_ids(self, target_input_ids: torch.LongTensor) -> to """ # Calculate new tokens since last call target_seq_len = target_input_ids.shape[-1] - new_token_count = target_seq_len - self._prev_target_seq_len + if self._target_seq_len_with_candidates == 0: + new_token_count = target_seq_len + else: + new_token_count = 1 target_new_ids = target_input_ids[:, -new_token_count:] - self._prev_target_seq_len = target_seq_len # Convert only the new tokens target_new_text = self.target_tokenizer.batch_decode( @@ -765,11 +788,16 @@ def _prepare_assistant_input_ids(self, target_input_ids: torch.LongTensor) -> to # Update or initialize assistant IDs if self._prev_assistant_ids is None: - self._prev_assistant_ids = assistant_new_ids + assistant_input_ids = assistant_new_ids else: - self._prev_assistant_ids = torch.cat([self._prev_assistant_ids, assistant_new_ids], dim=-1) - - return self._prev_assistant_ids + tokens_to_remove = self._target_seq_len_with_candidates + 1 - target_seq_len + # If the number of new tokens is greater than zero, truncate the previous assistant IDs + if tokens_to_remove > 0: + self._prev_assistant_ids = self._prev_assistant_ids[:, :-tokens_to_remove] + assistant_input_ids = torch.cat([self._prev_assistant_ids, assistant_new_ids], dim=-1) + assistant_input_ids = assistant_input_ids.to(torch.int) + + return assistant_input_ids, len(assistant_new_ids[0]) class PromptLookupCandidateGenerator(CandidateGenerator): diff --git a/src/transformers/generation/logits_process.py b/src/transformers/generation/logits_process.py index 5fcd35c921af86..e818b266cd7b7c 100644 --- a/src/transformers/generation/logits_process.py +++ b/src/transformers/generation/logits_process.py @@ -1860,14 +1860,15 @@ class SuppressTokensLogitsProcessor(LogitsProcessor): ``` """ - def __init__(self, suppress_tokens, device: str = "cpu"): + def __init__(self, suppress_tokens, device: str = "cpu", filter_value: float = -float("Inf")): self.suppress_tokens = torch.tensor(list(suppress_tokens), device=device) + self.filter_value = filter_value @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING) def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: vocab_tensor = torch.arange(scores.shape[-1], device=scores.device) - suppress_token_mask = isin_mps_friendly(vocab_tensor, self.suppress_tokens.to(scores.device)) - scores = torch.where(suppress_token_mask, -float("inf"), scores) + suppress_token_mask = isin_mps_friendly(vocab_tensor, self.suppress_tokens) + scores = torch.where(suppress_token_mask, self.filter_value, scores) return scores diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 7a9d78168ac903..d7d9757d3e4f39 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -858,6 +858,8 @@ def _get_candidate_generator( logits_processor=logits_processor, target_tokenizer=target_tokenizer, assistant_tokenizer=assistant_tokenizer, + # required in the case that self.config.vocab_size is different from the length of target_tokenizer.get_vocab() + target_vocab_size=self.config.vocab_size, ) case False: 
candidate_generator = AssistedCandidateGeneratorDifferentTokenizers( diff --git a/tests/generation/test_candidate_generator.py b/tests/generation/test_candidate_generator.py index 260f92109b7bb2..dd7e427a3bfda9 100644 --- a/tests/generation/test_candidate_generator.py +++ b/tests/generation/test_candidate_generator.py @@ -79,7 +79,7 @@ def test_get_assistant_to_target_input_ids(self): def test_get_suppress_input_ids(self): """Test the suppression of assistant input IDs not present in the target vocabulary.""" expected_suppress_ids = [4] - actual_suppress_ids = self.translator.suppress_input_ids + actual_suppress_ids = self.translator._suppress_input_ids self.assertEqual(actual_suppress_ids, expected_suppress_ids) def test_get_target_ids(self): From 4e92e9ced98340f3b183453ba8e67a0438851e11 Mon Sep 17 00:00:00 2001 From: Gaurav Date: Thu, 12 Dec 2024 02:30:25 +0000 Subject: [PATCH 61/76] Add unittests for Universal Assisted generation --- tests/test_universal_assisted_generation.py | 119 ++++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 tests/test_universal_assisted_generation.py diff --git a/tests/test_universal_assisted_generation.py b/tests/test_universal_assisted_generation.py new file mode 100644 index 00000000000000..8eae5a71de0749 --- /dev/null +++ b/tests/test_universal_assisted_generation.py @@ -0,0 +1,119 @@ +import unittest + +from zmq import device +import torch +import logging +from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig +from transformers.generation.candidate_generator import UniversalSpeculativeDecodingGenerator + +logging.basicConfig(level=logging.DEBUG, format='%(message)s') + +if torch.cuda.is_available(): + device = "cuda" + +class TestUniversalSpeculativeDecoding(unittest.TestCase): + @classmethod + def setUpClass(cls): + # Setup main and assistant models + cls.main_model = AutoModelForCausalLM.from_pretrained( + "meta-llama/Llama-3.2-1B-Instruct").to(device) + cls.assistant_model = AutoModelForCausalLM.from_pretrained( + "Qwen/Qwen2.5-0.5B-Instruct").to(device) + cls.main_tokenizer = AutoTokenizer.from_pretrained( + "meta-llama/Llama-3.2-1B-Instruct") + cls.assistant_tokenizer = AutoTokenizer.from_pretrained( + "Qwen/Qwen2.5-0.5B-Instruct") + cls.generation_config = GenerationConfig() + + # Ensure required tokens exist + if cls.main_tokenizer.pad_token_id is None: + cls.main_tokenizer.pad_token_id = cls.main_tokenizer.eos_token_id + if cls.main_tokenizer.bos_token_id is None: + cls.main_tokenizer.bos_token_id = cls.main_tokenizer.eos_token_id + + def setUp(self): + self.input_ids = torch.tensor([[1, 2, 3]]).to(device) + self.model_kwargs = { + "attention_mask": torch.ones_like(self.input_ids).to(device), + } + self.generator = UniversalSpeculativeDecodingGenerator( + input_ids=self.input_ids, + assistant_model=self.assistant_model, + target_tokenizer=self.main_tokenizer, + assistant_tokenizer=self.assistant_tokenizer, + generation_config=self.generation_config, + model_kwargs=self.model_kwargs, + target_vocab_size=self.main_tokenizer.vocab_size, + ) + + def test_basic_generation(self): + """Test basic speculative decoding works""" + input_text = "The quick brown fox" + input_ids = self.main_tokenizer.encode(input_text, return_tensors="pt") + self.generator.input_ids = input_ids + candidates, scores = self.generator.get_candidates(input_ids) + + self.assertIsNotNone(candidates) + self.assertIsNotNone(scores) + self.assertTrue(torch.is_tensor(candidates)) + self.assertTrue(torch.is_tensor(scores)) + + def 
test_mismatched_vocabularies(self): + """Test handling of mismatched vocabularies between models""" + # Create input with tokens present in main but not assistant vocab + # Find a token that is not in the assistant tokenizer but in + # the main tokenizer. + missing_token = next( + token + for token in self.main_tokenizer.get_vocab() + if token not in self.assistant_tokenizer.get_vocab() + ) + + input_ids = torch.tensor([[self.main_tokenizer.convert_tokens_to_ids(missing_token)]]) + self.generator.input_ids = input_ids + candidates, scores = self.generator.get_candidates(input_ids) + self.assertIsNotNone(candidates) + + def test_empty_input(self): + if False: + """Test handling of empty input""" + input_ids = torch.tensor([[]], dtype=torch.long) + self.generator.input_ids = input_ids + with self.assertRaises(ValueError): + self.generator.get_candidates(input_ids) + + def test_long_sequence(self): + if False: + """Test handling of very long input sequences""" + long_input = torch.ones((1, 2048), dtype=torch.long) + self.generator.input_ids = long_input + candidates, scores = self.generator.get_candidates(long_input) + self.assertLessEqual( + candidates.shape[1], + self.main_model.config.max_position_embeddings, + ) + + def test_speculation_depth(self): + """Test different speculation depths""" + input_ids = self.main_tokenizer.encode("Test text", return_tensors="pt") + self.generator.input_ids = input_ids + + for depth in [1, 8, 17]: + self.generation_config.num_assistant_tokens = depth + candidates, scores = self.generator.get_candidates(input_ids) + self.assertLessEqual( + candidates.shape[1] - input_ids.shape[1], depth + ) + + def test_device_consistency(self): + """Test handling of inputs on different devices""" + if torch.cuda.is_available(): + input_ids = torch.tensor([[1, 2, 3]]).to( + self.generator.assistant_model.device) + self.generator.input_ids = input_ids + candidates, scores = self.generator.get_candidates(input_ids) + self.assertEqual(candidates.device, input_ids.device) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file From a350b1cbdbdda53bf1679047f28efa656e16659c Mon Sep 17 00:00:00 2001 From: jmamou Date: Wed, 18 Dec 2024 02:56:46 -0800 Subject: [PATCH 62/76] fix style --- src/transformers/generation/candidate_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 0f57c1f6d56eab..26c81da0c1a5a2 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -304,7 +304,7 @@ def _update_past_and_masks( self.assistant_kwargs, input_ids.shape[-1], self.assistant_model.config.is_encoder_decoder ) self.assistant_kwargs = _prepare_token_type_ids(self.assistant_kwargs, input_ids.shape[-1]) - + return has_past_key_values def _prepare_generation_args(self, input_ids: torch.LongTensor, min_new_tokens: int, max_new_tokens: int) -> Dict: From e047adf6a414817d77843ce696f8ed11f9949ec4 Mon Sep 17 00:00:00 2001 From: jmamou Date: Wed, 18 Dec 2024 04:15:27 -0800 Subject: [PATCH 63/76] update tests --- tests/generation/test_candidate_generator.py | 72 +++++++++++++++----- 1 file changed, 56 insertions(+), 16 deletions(-) diff --git a/tests/generation/test_candidate_generator.py b/tests/generation/test_candidate_generator.py index dd7e427a3bfda9..9a2b2831e5d8e9 100644 --- a/tests/generation/test_candidate_generator.py +++ b/tests/generation/test_candidate_generator.py @@ 
-64,22 +64,27 @@ def setUp(self): self.target_tokenizer.get_vocab.return_value = self.target_vocab self.assistant_tokenizer.get_vocab.return_value = self.assistant_vocab + self.assistant_model_device = "cpu" + self.target_vocab_size = 6 # Instantiate the class under test self.translator = AssistantToTargetTranslator( - target_tokenizer=self.target_tokenizer, assistant_tokenizer=self.assistant_tokenizer + target_tokenizer=self.target_tokenizer, + assistant_tokenizer=self.assistant_tokenizer, + assistant_model_device=self.assistant_model_device, + target_vocab_size=self.target_vocab_size, ) def test_get_assistant_to_target_input_ids(self): """Test the mapping from assistant tokens to target tokens.""" - expected_mapping = {0: 0, 1: 1, 2: 2} - actual_mapping = self.translator._assistant_to_target_input_ids + expected_mapping = [0, 1, 2, self.translator.suppress_tokens_id, self.translator.suppress_tokens_id] + actual_mapping = self.translator._assistant_to_target_input_ids.tolist() self.assertEqual(actual_mapping, expected_mapping) def test_get_suppress_input_ids(self): """Test the suppression of assistant input IDs not present in the target vocabulary.""" - expected_suppress_ids = [4] - actual_suppress_ids = self.translator._suppress_input_ids + expected_suppress_ids = [3, 4] + actual_suppress_ids = self.translator._get_suppress_input_ids().tolist() self.assertEqual(actual_suppress_ids, expected_suppress_ids) def test_get_target_ids(self): @@ -89,8 +94,8 @@ def test_get_target_ids(self): assistant_candidate_ids = torch.LongTensor([[0, 1, 2, 4]]) # 'hello world foo baz' in assistant tokenizer expected_target_ids = torch.LongTensor( - [[0, 1, 2, 4]] - ) # 'hello world foo baz' in target tokenizer (baz id remains 4) + [[0, 1, 2, self.translator.suppress_tokens_id]] + ) # 'hello world foo baz' in target tokenizer (baz is mapped to self.translator.suppress_tokens_id since it does not exist in target vocab) actual_target_ids = self.translator.get_target_ids( assistant_input_ids, target_input_ids, assistant_candidate_ids @@ -100,10 +105,10 @@ def test_get_target_ids(self): def test_get_target_logits(self): """Test the conversion of assistant logits to target logits.""" # Assistant logits for IDs 0, 1, 2 - assistant_logits = torch.FloatTensor([[[0.1, 0.2, 0.3]]]) # Shape (1, 1, 3) + assistant_logits = torch.FloatTensor([[[0.1, 0.2, 0.3, 0.4, self.translator.filter_value]]]) # Shape (1, 1, 5) # Expected target logits (target_vocab_size = 4) - expected_target_logits = torch.full((1, 1, 4), -float("inf")) + expected_target_logits = torch.full((1, 1, self.target_vocab_size), self.translator.filter_value) expected_target_logits[0, 0, 0] = 0.1 # 'hello' expected_target_logits[0, 0, 1] = 0.2 # 'world' expected_target_logits[0, 0, 2] = 0.3 # 'foo' @@ -132,18 +137,38 @@ def setUp(self): self.assistant_tokenizer = MockTokenizer({"hello": 0, "world": 1, "foo": 2}) self.other_target_tokenizer = MockTokenizer({"foo": 2, "bar": 3}) self.other_assistant_tokenizer = MockTokenizer({"baz": 4, "qux": 5}) + self.assistant_model_device = "cpu" + self.target_vocab_size = 6 def test_same_instance_for_same_tokenizers(self): """Test that the same translator is returned for the same tokenizers.""" - translator1 = AssistantVocabTranslatorCache.get_translator(self.target_tokenizer, self.assistant_tokenizer) - translator2 = AssistantVocabTranslatorCache.get_translator(self.target_tokenizer, self.assistant_tokenizer) + translator1 = AssistantVocabTranslatorCache.get_translator( + self.target_tokenizer, + self.assistant_tokenizer, 
+ assistant_model_device=self.assistant_model_device, + target_vocab_size=self.target_vocab_size, + ) + translator2 = AssistantVocabTranslatorCache.get_translator( + self.target_tokenizer, + self.assistant_tokenizer, + assistant_model_device=self.assistant_model_device, + target_vocab_size=self.target_vocab_size, + ) self.assertIs(translator1, translator2, "Translators should be cached and identical") def test_different_instances_for_different_tokenizers(self): """Test that different tokenizers produce different translators.""" - translator1 = AssistantVocabTranslatorCache.get_translator(self.target_tokenizer, self.assistant_tokenizer) + translator1 = AssistantVocabTranslatorCache.get_translator( + self.target_tokenizer, + self.assistant_tokenizer, + assistant_model_device=self.assistant_model_device, + target_vocab_size=self.target_vocab_size, + ) translator2 = AssistantVocabTranslatorCache.get_translator( - self.other_target_tokenizer, self.other_assistant_tokenizer + self.other_target_tokenizer, + self.other_assistant_tokenizer, + assistant_model_device=self.assistant_model_device, + target_vocab_size=self.target_vocab_size, ) self.assertIsNot(translator1, translator2, "Translators should differ for different tokenizers") @@ -154,7 +179,12 @@ def test_cache_with_weakref_key(self): assistant_tokenizer = MockTokenizer({"hello": 0}) # Store translator in a local variable to avoid it being kept alive - translator = AssistantVocabTranslatorCache.get_translator(target_tokenizer, assistant_tokenizer) + translator = AssistantVocabTranslatorCache.get_translator( + target_tokenizer, + assistant_tokenizer, + assistant_model_device=self.assistant_model_device, + target_vocab_size=self.target_vocab_size, + ) self.assertEqual(len(AssistantVocabTranslatorCache._cache), initial_cache_size + 1) # Delete all strong references @@ -177,7 +207,12 @@ def test_weakref_cache_cleanup(self): def create_translator(): target_tokenizer = MockTokenizer({"hello": 0}) assistant_tokenizer = MockTokenizer({"hello": 0}) - translator = AssistantVocabTranslatorCache.get_translator(target_tokenizer, assistant_tokenizer) + translator = AssistantVocabTranslatorCache.get_translator( + target_tokenizer, + assistant_tokenizer, + assistant_model_device=self.assistant_model_device, + target_vocab_size=self.target_vocab_size, + ) # Create weak references before returning refs = (weakref.ref(translator), weakref.ref(target_tokenizer), weakref.ref(assistant_tokenizer)) # Remove strong references inside the function @@ -204,7 +239,12 @@ def test_thread_safety(self): translators = [] def get_translator(): - translator = AssistantVocabTranslatorCache.get_translator(self.target_tokenizer, self.assistant_tokenizer) + translator = AssistantVocabTranslatorCache.get_translator( + self.target_tokenizer, + self.assistant_tokenizer, + assistant_model_device=self.assistant_model_device, + target_vocab_size=self.target_vocab_size, + ) translators.append(translator) threads = [threading.Thread(target=get_translator) for _ in range(10)] From 011f5956d23ab308855d3d2cb7c3cf31c9571a28 Mon Sep 17 00:00:00 2001 From: Gaurav Jain Date: Tue, 17 Dec 2024 22:30:19 +0000 Subject: [PATCH 64/76] Remove unused import and fix `test_speculation_depth` test --- tests/test_universal_assisted_generation.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/test_universal_assisted_generation.py b/tests/test_universal_assisted_generation.py index 8eae5a71de0749..e6d2ea7ec30e22 100644 --- a/tests/test_universal_assisted_generation.py +++ 
b/tests/test_universal_assisted_generation.py @@ -1,6 +1,5 @@ import unittest -from zmq import device import torch import logging from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig @@ -8,8 +7,7 @@ logging.basicConfig(level=logging.DEBUG, format='%(message)s') -if torch.cuda.is_available(): - device = "cuda" +device = "cuda" if torch.cuda.is_available() else "cpu" class TestUniversalSpeculativeDecoding(unittest.TestCase): @classmethod @@ -99,7 +97,7 @@ def test_speculation_depth(self): self.generator.input_ids = input_ids for depth in [1, 8, 17]: - self.generation_config.num_assistant_tokens = depth + self.generator.num_assistant_tokens = depth candidates, scores = self.generator.get_candidates(input_ids) self.assertLessEqual( candidates.shape[1] - input_ids.shape[1], depth From 26524900c4ac2295ed40de96606ffab5a8a4789c Mon Sep 17 00:00:00 2001 From: Gaurav Jain Date: Wed, 18 Dec 2024 21:00:37 +0000 Subject: [PATCH 65/76] exclude special and reserved tokens from tokenizer for UAG --- tests/test_universal_assisted_generation.py | 35 +++++---------------- 1 file changed, 7 insertions(+), 28 deletions(-) diff --git a/tests/test_universal_assisted_generation.py b/tests/test_universal_assisted_generation.py index e6d2ea7ec30e22..8c45a0ba148622 100644 --- a/tests/test_universal_assisted_generation.py +++ b/tests/test_universal_assisted_generation.py @@ -1,11 +1,9 @@ import unittest import torch -import logging from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig from transformers.generation.candidate_generator import UniversalSpeculativeDecodingGenerator -logging.basicConfig(level=logging.DEBUG, format='%(message)s') device = "cuda" if torch.cuda.is_available() else "cpu" @@ -16,11 +14,11 @@ def setUpClass(cls): cls.main_model = AutoModelForCausalLM.from_pretrained( "meta-llama/Llama-3.2-1B-Instruct").to(device) cls.assistant_model = AutoModelForCausalLM.from_pretrained( - "Qwen/Qwen2.5-0.5B-Instruct").to(device) + "hf-internal-testing/tiny-random-gpt2").to(device) cls.main_tokenizer = AutoTokenizer.from_pretrained( "meta-llama/Llama-3.2-1B-Instruct") cls.assistant_tokenizer = AutoTokenizer.from_pretrained( - "Qwen/Qwen2.5-0.5B-Instruct") + "hf-internal-testing/tiny-random-gpt2") cls.generation_config = GenerationConfig() # Ensure required tokens exist @@ -62,35 +60,16 @@ def test_mismatched_vocabularies(self): # Find a token that is not in the assistant tokenizer but in # the main tokenizer. 
missing_token = next( - token - for token in self.main_tokenizer.get_vocab() - if token not in self.assistant_tokenizer.get_vocab() + token for token in self.main_tokenizer.get_vocab() + if token not in self.assistant_tokenizer.get_vocab() and + token not in self.main_tokenizer.all_special_tokens and + "reserved_" not in token ) - - input_ids = torch.tensor([[self.main_tokenizer.convert_tokens_to_ids(missing_token)]]) + input_ids = torch.tensor([[self.main_tokenizer.convert_tokens_to_ids(missing_token)]]) self.generator.input_ids = input_ids candidates, scores = self.generator.get_candidates(input_ids) self.assertIsNotNone(candidates) - def test_empty_input(self): - if False: - """Test handling of empty input""" - input_ids = torch.tensor([[]], dtype=torch.long) - self.generator.input_ids = input_ids - with self.assertRaises(ValueError): - self.generator.get_candidates(input_ids) - - def test_long_sequence(self): - if False: - """Test handling of very long input sequences""" - long_input = torch.ones((1, 2048), dtype=torch.long) - self.generator.input_ids = long_input - candidates, scores = self.generator.get_candidates(long_input) - self.assertLessEqual( - candidates.shape[1], - self.main_model.config.max_position_embeddings, - ) - def test_speculation_depth(self): """Test different speculation depths""" input_ids = self.main_tokenizer.encode("Test text", return_tensors="pt") From 701edbb522c62dffba22ebc2f84d8a7d3107a38f Mon Sep 17 00:00:00 2001 From: Gaurav Jain Date: Thu, 19 Dec 2024 08:51:57 +0000 Subject: [PATCH 66/76] mv `test_universal_assisted_generation.py` to `generation/test_candidate_generator.py` --- tests/generation/test_candidate_generator.py | 90 ++++++++++++++++++ tests/test_universal_assisted_generation.py | 96 -------------------- 2 files changed, 90 insertions(+), 96 deletions(-) delete mode 100644 tests/test_universal_assisted_generation.py diff --git a/tests/generation/test_candidate_generator.py b/tests/generation/test_candidate_generator.py index dd7e427a3bfda9..7d005f42536ab9 100644 --- a/tests/generation/test_candidate_generator.py +++ b/tests/generation/test_candidate_generator.py @@ -1,9 +1,12 @@ import gc +import logging import threading import unittest import weakref from unittest.mock import MagicMock +from zmq import device + import numpy as np import torch @@ -11,8 +14,11 @@ AssistantToTargetTranslator, AssistantVocabTranslatorCache, AssistedCandidateGeneratorDifferentTokenizers, + UniversalSpeculativeDecodingGenerator ) +from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig + class TestAssistedCandidateGeneratorDifferentTokenizers(unittest.TestCase): def test_no_intersection(self): @@ -216,3 +222,87 @@ def get_translator(): # All translators should be the same instance for translator in translators: self.assertIs(translators[0], translator, "All translators should be identical across threads") + + +class TestUniversalSpeculativeDecoding(unittest.TestCase): + device = "cuda" if torch.cuda.is_available() else "cpu" + + @classmethod + def setUpClass(cls): + cls.assistant_model = AutoModelForCausalLM.from_pretrained( + "hf-internal-testing/tiny-random-gpt2").to(cls.device) + cls.main_tokenizer = AutoTokenizer.from_pretrained( + "meta-llama/Llama-3.2-1B-Instruct") + cls.assistant_tokenizer = AutoTokenizer.from_pretrained( + "hf-internal-testing/tiny-random-gpt2") + cls.generation_config = GenerationConfig() + + # Ensure required tokens exist + if cls.main_tokenizer.pad_token_id is None: + cls.main_tokenizer.pad_token_id = 
cls.main_tokenizer.eos_token_id + if cls.main_tokenizer.bos_token_id is None: + cls.main_tokenizer.bos_token_id = cls.main_tokenizer.eos_token_id + + def setUp(self): + self.input_ids = torch.tensor([[1, 2, 3]]).to(self.device) + self.model_kwargs = { + "attention_mask": torch.ones_like(self.input_ids).to(self.device), + } + self.generator = UniversalSpeculativeDecodingGenerator( + input_ids=self.input_ids, + assistant_model=self.assistant_model, + target_tokenizer=self.main_tokenizer, + assistant_tokenizer=self.assistant_tokenizer, + generation_config=self.generation_config, + model_kwargs=self.model_kwargs, + target_vocab_size=self.main_tokenizer.vocab_size, + ) + + def test_basic_generation(self): + """Test basic speculative decoding works""" + input_text = "The quick brown fox" + input_ids = self.main_tokenizer.encode(input_text, return_tensors="pt") + self.generator.input_ids = input_ids + candidates, scores = self.generator.get_candidates(input_ids) + + self.assertIsNotNone(candidates) + self.assertIsNotNone(scores) + self.assertTrue(torch.is_tensor(candidates)) + self.assertTrue(torch.is_tensor(scores)) + + def test_mismatched_vocabularies(self): + """Test handling of mismatched vocabularies between models""" + # Create input with tokens present in main but not assistant vocab + # Find a token that is not in the assistant tokenizer but in + # the main tokenizer. + missing_token = next( + token for token in self.main_tokenizer.get_vocab() + if token not in self.assistant_tokenizer.get_vocab() and + token not in self.main_tokenizer.all_special_tokens and + "reserved_" not in token + ) + input_ids = torch.tensor([[self.main_tokenizer.convert_tokens_to_ids(missing_token)]]) + self.generator.input_ids = input_ids + candidates, scores = self.generator.get_candidates(input_ids) + self.assertIsNotNone(candidates) + + def test_speculation_depth(self): + """Test different speculation depths""" + input_ids = self.main_tokenizer.encode("Test text", return_tensors="pt") + self.generator.input_ids = input_ids + + for depth in [1, 8, 17]: + self.generator.num_assistant_tokens = depth + candidates, scores = self.generator.get_candidates(input_ids) + self.assertLessEqual( + candidates.shape[1] - input_ids.shape[1], depth + ) + + def test_device_consistency(self): + """Test handling of inputs on different devices""" + if torch.cuda.is_available(): + input_ids = torch.tensor([[1, 2, 3]]).to( + self.generator.assistant_model.device) + self.generator.input_ids = input_ids + candidates, scores = self.generator.get_candidates(input_ids) + self.assertEqual(candidates.device, input_ids.device) diff --git a/tests/test_universal_assisted_generation.py b/tests/test_universal_assisted_generation.py deleted file mode 100644 index 8c45a0ba148622..00000000000000 --- a/tests/test_universal_assisted_generation.py +++ /dev/null @@ -1,96 +0,0 @@ -import unittest - -import torch -from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig -from transformers.generation.candidate_generator import UniversalSpeculativeDecodingGenerator - - -device = "cuda" if torch.cuda.is_available() else "cpu" - -class TestUniversalSpeculativeDecoding(unittest.TestCase): - @classmethod - def setUpClass(cls): - # Setup main and assistant models - cls.main_model = AutoModelForCausalLM.from_pretrained( - "meta-llama/Llama-3.2-1B-Instruct").to(device) - cls.assistant_model = AutoModelForCausalLM.from_pretrained( - "hf-internal-testing/tiny-random-gpt2").to(device) - cls.main_tokenizer = AutoTokenizer.from_pretrained( - 
"meta-llama/Llama-3.2-1B-Instruct") - cls.assistant_tokenizer = AutoTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-gpt2") - cls.generation_config = GenerationConfig() - - # Ensure required tokens exist - if cls.main_tokenizer.pad_token_id is None: - cls.main_tokenizer.pad_token_id = cls.main_tokenizer.eos_token_id - if cls.main_tokenizer.bos_token_id is None: - cls.main_tokenizer.bos_token_id = cls.main_tokenizer.eos_token_id - - def setUp(self): - self.input_ids = torch.tensor([[1, 2, 3]]).to(device) - self.model_kwargs = { - "attention_mask": torch.ones_like(self.input_ids).to(device), - } - self.generator = UniversalSpeculativeDecodingGenerator( - input_ids=self.input_ids, - assistant_model=self.assistant_model, - target_tokenizer=self.main_tokenizer, - assistant_tokenizer=self.assistant_tokenizer, - generation_config=self.generation_config, - model_kwargs=self.model_kwargs, - target_vocab_size=self.main_tokenizer.vocab_size, - ) - - def test_basic_generation(self): - """Test basic speculative decoding works""" - input_text = "The quick brown fox" - input_ids = self.main_tokenizer.encode(input_text, return_tensors="pt") - self.generator.input_ids = input_ids - candidates, scores = self.generator.get_candidates(input_ids) - - self.assertIsNotNone(candidates) - self.assertIsNotNone(scores) - self.assertTrue(torch.is_tensor(candidates)) - self.assertTrue(torch.is_tensor(scores)) - - def test_mismatched_vocabularies(self): - """Test handling of mismatched vocabularies between models""" - # Create input with tokens present in main but not assistant vocab - # Find a token that is not in the assistant tokenizer but in - # the main tokenizer. - missing_token = next( - token for token in self.main_tokenizer.get_vocab() - if token not in self.assistant_tokenizer.get_vocab() and - token not in self.main_tokenizer.all_special_tokens and - "reserved_" not in token - ) - input_ids = torch.tensor([[self.main_tokenizer.convert_tokens_to_ids(missing_token)]]) - self.generator.input_ids = input_ids - candidates, scores = self.generator.get_candidates(input_ids) - self.assertIsNotNone(candidates) - - def test_speculation_depth(self): - """Test different speculation depths""" - input_ids = self.main_tokenizer.encode("Test text", return_tensors="pt") - self.generator.input_ids = input_ids - - for depth in [1, 8, 17]: - self.generator.num_assistant_tokens = depth - candidates, scores = self.generator.get_candidates(input_ids) - self.assertLessEqual( - candidates.shape[1] - input_ids.shape[1], depth - ) - - def test_device_consistency(self): - """Test handling of inputs on different devices""" - if torch.cuda.is_available(): - input_ids = torch.tensor([[1, 2, 3]]).to( - self.generator.assistant_model.device) - self.generator.input_ids = input_ids - candidates, scores = self.generator.get_candidates(input_ids) - self.assertEqual(candidates.device, input_ids.device) - - -if __name__ == '__main__': - unittest.main() \ No newline at end of file From 3b89341d7dd8da4b3badd8d890edf3a0bd1022a6 Mon Sep 17 00:00:00 2001 From: gauravjain14 <41287729+gauravjain14@users.noreply.github.com> Date: Thu, 19 Dec 2024 22:54:48 -0800 Subject: [PATCH 67/76] Remove unused imports and fix style using `make style` (#9) --- tests/generation/test_candidate_generator.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/tests/generation/test_candidate_generator.py b/tests/generation/test_candidate_generator.py index f4e7bba1dd2835..80065c16080462 100644 --- 
a/tests/generation/test_candidate_generator.py +++ b/tests/generation/test_candidate_generator.py @@ -1,24 +1,20 @@ import gc -import logging import threading import unittest import weakref from unittest.mock import MagicMock -from zmq import device - import numpy as np import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig from transformers.generation.candidate_generator import ( AssistantToTargetTranslator, AssistantVocabTranslatorCache, AssistedCandidateGeneratorDifferentTokenizers, - UniversalSpeculativeDecodingGenerator + UniversalSpeculativeDecodingGenerator, ) -from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig - class TestAssistedCandidateGeneratorDifferentTokenizers(unittest.TestCase): def test_no_intersection(self): @@ -313,7 +309,7 @@ def test_basic_generation(self): def test_mismatched_vocabularies(self): """Test handling of mismatched vocabularies between models""" # Create input with tokens present in main but not assistant vocab - # Find a token that is not in the assistant tokenizer but in + # Find a token that is not in the assistant tokenizer but in # the main tokenizer. missing_token = next( token for token in self.main_tokenizer.get_vocab() @@ -321,7 +317,7 @@ def test_mismatched_vocabularies(self): token not in self.main_tokenizer.all_special_tokens and "reserved_" not in token ) - input_ids = torch.tensor([[self.main_tokenizer.convert_tokens_to_ids(missing_token)]]) + input_ids = torch.tensor([[self.main_tokenizer.convert_tokens_to_ids(missing_token)]]) self.generator.input_ids = input_ids candidates, scores = self.generator.get_candidates(input_ids) self.assertIsNotNone(candidates) From e43dba87e79ec81c6ebf61f27be3af068ba7dcba Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sat, 21 Dec 2024 13:52:27 -0500 Subject: [PATCH 68/76] formatting --- tests/generation/test_candidate_generator.py | 27 +++++++++----------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/tests/generation/test_candidate_generator.py b/tests/generation/test_candidate_generator.py index 80065c16080462..8f3e024ae3588b 100644 --- a/tests/generation/test_candidate_generator.py +++ b/tests/generation/test_candidate_generator.py @@ -265,12 +265,11 @@ class TestUniversalSpeculativeDecoding(unittest.TestCase): @classmethod def setUpClass(cls): - cls.assistant_model = AutoModelForCausalLM.from_pretrained( - "hf-internal-testing/tiny-random-gpt2").to(cls.device) - cls.main_tokenizer = AutoTokenizer.from_pretrained( - "meta-llama/Llama-3.2-1B-Instruct") - cls.assistant_tokenizer = AutoTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-gpt2") + cls.assistant_model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to( + cls.device + ) + cls.main_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct") + cls.assistant_tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") cls.generation_config = GenerationConfig() # Ensure required tokens exist @@ -312,10 +311,11 @@ def test_mismatched_vocabularies(self): # Find a token that is not in the assistant tokenizer but in # the main tokenizer. 
missing_token = next( - token for token in self.main_tokenizer.get_vocab() - if token not in self.assistant_tokenizer.get_vocab() and - token not in self.main_tokenizer.all_special_tokens and - "reserved_" not in token + token + for token in self.main_tokenizer.get_vocab() + if token not in self.assistant_tokenizer.get_vocab() + and token not in self.main_tokenizer.all_special_tokens + and "reserved_" not in token ) input_ids = torch.tensor([[self.main_tokenizer.convert_tokens_to_ids(missing_token)]]) self.generator.input_ids = input_ids
@@ -330,15 +330,12 @@ def test_speculation_depth(self): for depth in [1, 8, 17]: self.generator.num_assistant_tokens = depth candidates, scores = self.generator.get_candidates(input_ids) - self.assertLessEqual( - candidates.shape[1] - input_ids.shape[1], depth - ) + self.assertLessEqual(candidates.shape[1] - input_ids.shape[1], depth) def test_device_consistency(self): """Test handling of inputs on different devices""" if torch.cuda.is_available(): - input_ids = torch.tensor([[1, 2, 3]]).to( - self.generator.assistant_model.device) + input_ids = torch.tensor([[1, 2, 3]]).to(self.generator.assistant_model.device) self.generator.input_ids = input_ids candidates, scores = self.generator.get_candidates(input_ids) self.assertEqual(candidates.device, input_ids.device)
From a5297951adca2c2a9946b3fb5771df58610f3991 Mon Sep 17 00:00:00 2001
From: gauravjain14 <41287729+gauravjain14@users.noreply.github.com>
Date: Sat, 21 Dec 2024 15:42:17 -0800
Subject: [PATCH 69/76] Swap gated `meta-llama/llama-3.2` with `allenai/llama` (#10)
---
 tests/generation/test_candidate_generator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tests/generation/test_candidate_generator.py b/tests/generation/test_candidate_generator.py index 8f3e024ae3588b..2fcda92a9e1b86 100644
--- a/tests/generation/test_candidate_generator.py
+++ b/tests/generation/test_candidate_generator.py
@@ -268,7 +268,7 @@ def setUpClass(cls): cls.assistant_model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to( cls.device ) - cls.main_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct") + cls.main_tokenizer = AutoTokenizer.from_pretrained("allenai/Llama-3.1-Tulu-3-8B-SFT") cls.assistant_tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") cls.generation_config = GenerationConfig()
From 25cd5da88016a6afa1ace506c61829d811eae75b Mon Sep 17 00:00:00 2001
From: Jonathan Mamou
Date: Thu, 9 Jan 2025 17:13:09 +0200
Subject: [PATCH 70/76] Fix space sign disagreement (#12)
* default values for AssistantToTargetTranslator fields
* fix space sign
* minor
* fix test + style
---
 .../generation/candidate_generator.py | 37 ++++++++++++++++---
 tests/generation/test_candidate_generator.py | 6 +++
 2 files changed, 37 insertions(+), 6 deletions(-)
diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 26c81da0c1a5a2..5ba09a9d518d88 100644
--- a/src/transformers/generation/candidate_generator.py
+++ b/src/transformers/generation/candidate_generator.py
@@ -627,15 +627,18 @@ def __init__( self, target_tokenizer: "PreTrainedTokenizerBase", assistant_tokenizer: "PreTrainedTokenizerBase", - assistant_model_device, - target_vocab_size: int, + assistant_model_device: str = "cpu", + target_vocab_size: int = None, filter_value: float = -float("Inf"), suppress_tokens_id: int = -1, ): self._target_tokenizer: "PreTrainedTokenizerBase" = target_tokenizer
self._assistant_tokenizer: "PreTrainedTokenizerBase" = assistant_tokenizer - self._assistant_model_device = assistant_model_device - self.target_vocab_size: int = target_vocab_size + self._assistant_model_device: str = assistant_model_device + if target_vocab_size: + self.target_vocab_size: int = target_vocab_size + else: + self.target_vocab_size: int = len(self._target_tokenizer.get_vocab()) self.filter_value: float = filter_value self.suppress_tokens_id: int = suppress_tokens_id self._assistant_to_target_input_ids = self._get_assistant_to_target_input_ids()
@@ -646,6 +649,28 @@ def __init__( def _get_assistant_to_target_input_ids(self): target_vocab = self._target_tokenizer.get_vocab() assistant_vocab = self._assistant_tokenizer.get_vocab() + + space_str = " " + target_space_ids = self._target_tokenizer(space_str, add_special_tokens=False)["input_ids"] + if len(target_space_ids) > 0: + target_space_sign = self._target_tokenizer.convert_ids_to_tokens(target_space_ids)[0][0] + + assistant_space_ids = self._assistant_tokenizer(space_str, add_special_tokens=False)["input_ids"] + if len(assistant_space_ids) > 0: + assistant_space_sign = self._assistant_tokenizer.convert_ids_to_tokens(assistant_space_ids)[0][0] + + if target_space_sign != assistant_space_sign: + # If the assistant tokenizer has a different space sign than the target tokenizer, + # we need to replace the assistant space sign with the target space sign in the assistant_vocab. + assistant_vocab = { + ( + tok.replace(assistant_space_sign, target_space_sign, 1) + if tok.startswith(assistant_space_sign) + else tok + ): idx + for tok, idx in assistant_vocab.items() + } + max_assistant_index = max(assistant_vocab.values()) assistant_to_target_input_ids = torch.full((max_assistant_index + 1,), self.suppress_tokens_id, dtype=int) for tok, idx in assistant_vocab.items():
@@ -707,8 +732,8 @@ def get_translator( cls, target_tokenizer: "PreTrainedTokenizerBase", assistant_tokenizer: "PreTrainedTokenizerBase", - assistant_model_device, - target_vocab_size: int, + assistant_model_device: str = "cpu", + target_vocab_size: int = None, ) -> AssistantToTargetTranslator: with cls._lock: assistant_dict = cls._cache.get(target_tokenizer)
diff --git a/tests/generation/test_candidate_generator.py b/tests/generation/test_candidate_generator.py index 2fcda92a9e1b86..7c65f369742576 100644
--- a/tests/generation/test_candidate_generator.py
+++ b/tests/generation/test_candidate_generator.py
@@ -129,6 +129,12 @@ def __init__(self, vocab=None): def get_vocab(self): return self._vocab + def __call__(self, text, add_special_tokens=True): + # Mock implementation of the __call__ method + tokens = text.split() + input_ids = [self._vocab.get(token, 0) for token in tokens] + return {"input_ids": input_ids} + class TestAssistantVocabTranslatorCache(unittest.TestCase): def setUp(self):
From 77edae266dc9d96ffea94bb9551d1ad8b27db1c9 Mon Sep 17 00:00:00 2001
From: Jonathan Mamou
Date: Thu, 9 Jan 2025 17:29:02 +0200
Subject: [PATCH 71/76] Default values for some fields of assistant to target translator (#11)
* default values for AssistantToTargetTranslator fields
* fix
* add support to empty logit_processors
---
 .../generation/candidate_generator.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)
diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 5ba09a9d518d88..80940e3b7e0c92 100644
--- a/src/transformers/generation/candidate_generator.py
+++
b/src/transformers/generation/candidate_generator.py @@ -628,7 +628,8 @@ def __init__( target_tokenizer: "PreTrainedTokenizerBase", assistant_tokenizer: "PreTrainedTokenizerBase", assistant_model_device: str = "cpu", - target_vocab_size: int = None, + default_AssistantToTargetTranslator + target_vocab_size: Optional[int] = None, filter_value: float = -float("Inf"), suppress_tokens_id: int = -1, ): @@ -642,9 +643,13 @@ def __init__( self.filter_value: float = filter_value self.suppress_tokens_id: int = suppress_tokens_id self._assistant_to_target_input_ids = self._get_assistant_to_target_input_ids() - self.logits_processors: LogitsProcessorList = LogitsProcessorList( - [SuppressTokensLogitsProcessor(self._get_suppress_input_ids(), self._assistant_model_device)] - ) + self._suppress_input_ids: list[int] = self._get_suppress_input_ids() + self.logits_processors: Optional[LogitsProcessorList] = None + if len(self._suppress_input_ids) > 0: + # len(self._suppress_input_ids) = 0 if the assistant vocab is a subset of the target vocab + self.logits_processors = LogitsProcessorList( + [SuppressTokensLogitsProcessor(self._get_suppress_input_ids(), self._assistant_model_device)] + ) def _get_assistant_to_target_input_ids(self): target_vocab = self._target_tokenizer.get_vocab() @@ -733,7 +738,7 @@ def get_translator( target_tokenizer: "PreTrainedTokenizerBase", assistant_tokenizer: "PreTrainedTokenizerBase", assistant_model_device: str = "cpu", - target_vocab_size: int = None, + target_vocab_size: Optional[int] = None, ) -> AssistantToTargetTranslator: with cls._lock: assistant_dict = cls._cache.get(target_tokenizer) @@ -826,7 +831,8 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, generation_args["generation_config"].return_dict_in_generate = True # Generate and process outputs using translator - generation_args["logits_processor"] = self._atm_translator.logits_processors + if self._atm_translator.logits_processors is not None: + generation_args["logits_processor"] = self._atm_translator.logits_processors self._prev_assistant_ids, assistant_candidate_logits = self._generate_candidates(generation_args) # Use translator to convert tokens and logits From a2a2882bf29eb374de0ce8dc5ec85577cd72d23a Mon Sep 17 00:00:00 2001 From: Jonathan Mamou Date: Sun, 12 Jan 2025 14:35:20 +0200 Subject: [PATCH 72/76] Update candidate_generator.py (#15) fix typo --- src/transformers/generation/candidate_generator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 80940e3b7e0c92..d2a246c81f03e7 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -628,7 +628,6 @@ def __init__( target_tokenizer: "PreTrainedTokenizerBase", assistant_tokenizer: "PreTrainedTokenizerBase", assistant_model_device: str = "cpu", - default_AssistantToTargetTranslator target_vocab_size: Optional[int] = None, filter_value: float = -float("Inf"), suppress_tokens_id: int = -1, From a5569471f454ed5c06d66a25a86eeb16ec5f8bbc Mon Sep 17 00:00:00 2001 From: Jonathan Mamou Date: Sun, 12 Jan 2025 17:25:13 +0200 Subject: [PATCH 73/76] BUG fix in _prepare_assistant_input_ids (#14) * fix _prepare_assistant_input_ids * target_to_assistant_input_ids * Update src/transformers/generation/candidate_generator.py Co-authored-by: Nadav Timor --------- Co-authored-by: Nadav Timor --- .../generation/candidate_generator.py | 34 ++++++++++++------- 1 file 
changed, 22 insertions(+), 12 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index d2a246c81f03e7..980505e8979db2 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -641,7 +641,7 @@ def __init__( self.target_vocab_size: int = len(self._target_tokenizer.get_vocab()) self.filter_value: float = filter_value self.suppress_tokens_id: int = suppress_tokens_id - self._assistant_to_target_input_ids = self._get_assistant_to_target_input_ids() + self._assistant_to_target_input_ids, self.target_to_assistant_input_ids = self._get_assistant_to_target_input_ids() self._suppress_input_ids: list[int] = self._get_suppress_input_ids() self.logits_processors: Optional[LogitsProcessorList] = None if len(self._suppress_input_ids) > 0: @@ -677,10 +677,13 @@ def _get_assistant_to_target_input_ids(self): max_assistant_index = max(assistant_vocab.values()) assistant_to_target_input_ids = torch.full((max_assistant_index + 1,), self.suppress_tokens_id, dtype=int) - for tok, idx in assistant_vocab.items(): - if tok in target_vocab: - assistant_to_target_input_ids[idx] = target_vocab[tok] - return assistant_to_target_input_ids.to(self._assistant_model_device) + target_to_assistant_input_id: Dict[int, int] = {} + for tok, assistant_id in assistant_vocab.items(): + target_id = target_vocab.get(tok) + if target_id is not None: + assistant_to_target_input_ids[assistant_id] = target_id + target_to_assistant_input_ids[target_id] = assistant_id + return assistant_to_target_input_ids.to(self._assistant_model_device), target_to_assistant_input_ids def _get_suppress_input_ids(self) -> list[int]: """ @@ -864,13 +867,20 @@ def _prepare_assistant_input_ids(self, target_input_ids: torch.LongTensor) -> to new_token_count = 1 target_new_ids = target_input_ids[:, -new_token_count:] - # Convert only the new tokens - target_new_text = self.target_tokenizer.batch_decode( - target_new_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True - ) - assistant_new_ids = self.assistant_tokenizer(target_new_text, add_special_tokens=False, return_tensors="pt")[ - "input_ids" - ].to(self.assistant_model.device) + # Convert the new tokens + assistant_new_ids = None + if self._target_seq_len_with_candidates > 0: + # we have only one new token and we can directly convert it + assistant_new_ids = self._atm_translator.target_to_assistant_input_ids.get(target_new_ids[0].item()) + if assistant_new_ids is None: + target_new_text = self.target_tokenizer.batch_decode( + target_new_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True + ) + assistant_new_ids = self.assistant_tokenizer( + target_new_text, add_special_tokens=False, return_tensors="pt" + )["input_ids"].to(self.assistant_model.device) + else: + assistant_new_ids = torch.tensor([[assistant_new_ids]], device=self.assistant_model.device) # Update or initialize assistant IDs if self._prev_assistant_ids is None: From 407d89846d553129aae702e6e882d0813f28e219 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Mon, 13 Jan 2025 10:37:49 +0200 Subject: [PATCH 74/76] typo (`target_to_assistant_input_ids`) --- src/transformers/generation/candidate_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 980505e8979db2..60f494c2209f33 100644 --- a/src/transformers/generation/candidate_generator.py +++ 
b/src/transformers/generation/candidate_generator.py @@ -677,7 +677,7 @@ def _get_assistant_to_target_input_ids(self): max_assistant_index = max(assistant_vocab.values()) assistant_to_target_input_ids = torch.full((max_assistant_index + 1,), self.suppress_tokens_id, dtype=int) - target_to_assistant_input_id: Dict[int, int] = {} + target_to_assistant_input_ids: Dict[int, int] = {} for tok, assistant_id in assistant_vocab.items(): target_id = target_vocab.get(tok) if target_id is not None: From a24b1934c3ffe0a107ead0d8e73222c334155588 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Mon, 13 Jan 2025 10:50:12 +0200 Subject: [PATCH 75/76] formatting --- src/transformers/generation/candidate_generator.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 60f494c2209f33..954076058e0426 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -641,7 +641,9 @@ def __init__( self.target_vocab_size: int = len(self._target_tokenizer.get_vocab()) self.filter_value: float = filter_value self.suppress_tokens_id: int = suppress_tokens_id - self._assistant_to_target_input_ids, self.target_to_assistant_input_ids = self._get_assistant_to_target_input_ids() + self._assistant_to_target_input_ids, self.target_to_assistant_input_ids = ( + self._get_assistant_to_target_input_ids() + ) self._suppress_input_ids: list[int] = self._get_suppress_input_ids() self.logits_processors: Optional[LogitsProcessorList] = None if len(self._suppress_input_ids) > 0: From 1afdaa3a6299832389a2469725cdc0a5648e34da Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Wed, 15 Jan 2025 17:08:36 -0500 Subject: [PATCH 76/76] merge upstream/main --- CODEOWNERS | 369 ++ README.md | 26 +- docker/transformers-all-latest-gpu/Dockerfile | 3 + docs/source/ar/_toctree.yml | 12 +- docs/source/ar/tasks/multiple_choice.md | 452 +++ docs/source/ar/tasks/token_classification.md | 550 +++ docs/source/ar/tasks/translation.md | 407 +++ docs/source/en/_toctree.yml | 14 +- docs/source/en/chat_templating.md | 44 +- docs/source/en/deepspeed.md | 14 + docs/source/en/generation_strategies.md | 28 + docs/source/en/index.md | 7 + docs/source/en/installation.md | 52 +- docs/source/en/llm_optims.md | 11 +- docs/source/en/llm_tutorial.md | 5 +- docs/source/en/model_doc/diffllama.md | 59 + docs/source/en/model_doc/emu3.md | 179 + docs/source/en/model_doc/helium.md | 158 + docs/source/en/model_doc/modernbert.md | 4 +- docs/source/en/model_doc/moonshine.md | 56 + docs/source/en/model_doc/musicgen_melody.md | 1 - docs/source/en/model_doc/qwen2_audio.md | 31 + docs/source/en/model_doc/siglip.md | 2 +- docs/source/en/model_doc/textnet.md | 55 + docs/source/en/model_doc/vitpose.md | 254 ++ docs/source/en/perf_infer_gpu_one.md | 10 +- docs/source/en/quantization/gptq.md | 52 +- docs/source/en/quantization/overview.md | 56 +- docs/source/en/quantization/torchao.md | 17 +- docs/source/en/tasks/video_text_to_text.md | 2 +- .../ja/model_doc/decision_transformer.md | 25 +- docs/source/ko/_toctree.yml | 32 +- docs/source/ko/model_doc/altclip.md | 78 + examples/flax/question-answering/run_qa.py | 2 +- .../run_flax_speech_recognition_seq2seq.py | 2 +- .../flax/text-classification/run_flax_glue.py | 2 +- .../flax/token-classification/run_flax_ner.py | 2 +- .../configuration_my_new_model.py | 4 +- .../image_processing_new_imgproc_model.py | 2 +- .../modular-transformers/modeling_dummy.py | 2 
+- .../modeling_multimodal1.py | 2 +- .../modeling_my_new_model2.py | 2 +- .../modeling_new_task_model.py | 3 - .../modular-transformers/modeling_super.py | 2 +- .../run_audio_classification.py | 2 +- .../contrastive-image-text/run_clip.py | 2 +- .../run_image_classification.py | 2 +- .../run_image_classification_no_trainer.py | 2 +- examples/pytorch/image-pretraining/run_mae.py | 2 +- examples/pytorch/image-pretraining/run_mim.py | 2 +- .../image-pretraining/run_mim_no_trainer.py | 2 +- .../run_instance_segmentation.py | 2 +- .../run_instance_segmentation_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_clm.py | 2 +- .../language-modeling/run_clm_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_fim.py | 2 +- .../language-modeling/run_fim_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_mlm.py | 2 +- .../language-modeling/run_mlm_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_plm.py | 2 +- examples/pytorch/multiple-choice/run_swag.py | 2 +- .../multiple-choice/run_swag_no_trainer.py | 2 +- .../object-detection/run_object_detection.py | 2 +- .../run_object_detection_no_trainer.py | 2 +- examples/pytorch/question-answering/run_qa.py | 2 +- .../question-answering/run_qa_beam_search.py | 2 +- .../run_qa_beam_search_no_trainer.py | 2 +- .../question-answering/run_qa_no_trainer.py | 2 +- .../question-answering/run_seq2seq_qa.py | 2 +- .../run_semantic_segmentation.py | 2 +- .../run_semantic_segmentation_no_trainer.py | 2 +- .../run_speech_recognition_ctc.py | 2 +- .../run_speech_recognition_ctc_adapter.py | 2 +- .../run_speech_recognition_seq2seq.py | 2 +- .../summarization/run_summarization.py | 2 +- .../run_summarization_no_trainer.py | 2 +- .../text-classification/run_classification.py | 2 +- .../pytorch/text-classification/run_glue.py | 2 +- .../run_glue_no_trainer.py | 2 +- .../pytorch/text-classification/run_xnli.py | 2 +- .../pytorch/token-classification/run_ner.py | 2 +- .../run_ner_no_trainer.py | 2 +- .../pytorch/translation/run_translation.py | 2 +- .../translation/run_translation_no_trainer.py | 2 +- .../decision_transformer/requirements.txt | 2 +- .../contrastive-image-text/run_clip.py | 2 +- .../run_image_classification.py | 2 +- .../tensorflow/multiple-choice/run_swag.py | 2 +- .../tensorflow/question-answering/run_qa.py | 2 +- .../summarization/run_summarization.py | 2 +- .../text-classification/run_glue.py | 2 +- .../tensorflow/translation/run_translation.py | 2 +- read_video.py | 77 + run.py | 107 + setup.py | 4 +- src/transformers/__init__.py | 125 +- src/transformers/configuration_utils.py | 7 +- src/transformers/convert_slow_tokenizer.py | 89 + src/transformers/data/data_collator.py | 113 +- src/transformers/dependency_versions_table.py | 2 +- src/transformers/generation/flax_utils.py | 1 - src/transformers/generation/streamers.py | 2 + src/transformers/generation/tf_utils.py | 1 - src/transformers/generation/utils.py | 7 +- src/transformers/image_utils.py | 215 ++ src/transformers/integrations/__init__.py | 2 - .../integrations/flex_attention.py | 2 +- src/transformers/integrations/ggml.py | 248 -- .../integrations/integration_utils.py | 6 + src/transformers/integrations/peft.py | 62 + src/transformers/loss/loss_utils.py | 5 +- .../modeling_gguf_pytorch_utils.py | 142 +- src/transformers/modeling_rope_utils.py | 15 +- src/transformers/modeling_utils.py | 94 +- src/transformers/models/__init__.py | 7 + src/transformers/models/aria/modeling_aria.py | 21 +- src/transformers/models/aria/modular_aria.py | 4 +- 
.../models/auto/configuration_auto.py | 21 +- .../models/auto/feature_extraction_auto.py | 1 + src/transformers/models/auto/modeling_auto.py | 19 + .../models/auto/processing_auto.py | 2 + .../models/auto/tokenization_auto.py | 10 + .../models/bamba/modeling_bamba.py | 16 +- .../models/bart/tokenization_bart_fast.py | 10 +- .../models/beit/image_processing_beit.py | 2 +- .../models/bit/image_processing_bit.py | 2 +- .../tokenization_blenderbot_fast.py | 10 +- .../models/blip/image_processing_blip.py | 2 +- src/transformers/models/blip/modeling_blip.py | 7 + .../models/blip_2/configuration_blip_2.py | 3 - .../image_processing_bridgetower.py | 2 +- .../chameleon/image_processing_chameleon.py | 2 +- .../models/chameleon/modeling_chameleon.py | 114 +- .../models/chameleon/processing_chameleon.py | 1 + .../image_processing_chinese_clip.py | 2 +- .../models/clip/image_processing_clip.py | 2 +- src/transformers/models/clip/modeling_clip.py | 11 +- .../models/clipseg/modeling_clipseg.py | 7 + .../codegen/tokenization_codegen_fast.py | 10 - .../models/cohere/modeling_cohere.py | 541 +-- .../models/cohere/modular_cohere.py | 400 +++ .../models/cohere2/modeling_cohere2.py | 372 +- .../models/cohere2/modular_cohere2.py | 250 +- .../image_processing_conditional_detr.py | 2 +- .../convnext/image_processing_convnext.py | 2 +- .../deberta/tokenization_deberta_fast.py | 11 - .../image_processing_deformable_detr.py | 2 +- .../image_processing_deformable_detr_fast.py | 84 +- .../modular_deformable_detr.py | 144 + .../models/deit/image_processing_deit.py | 2 +- .../deprecated/deta/image_processing_deta.py | 2 +- .../image_processing_efficientformer.py | 2 +- .../deprecated/tvlt/image_processing_tvlt.py | 2 +- .../vit_hybrid/image_processing_vit_hybrid.py | 2 +- .../models/detr/image_processing_detr.py | 2 +- .../models/detr/image_processing_detr_fast.py | 16 +- src/transformers/models/diffllama/__init__.py | 27 + .../diffllama/configuration_diffllama.py | 199 ++ .../models/diffllama/modeling_diffllama.py | 1423 ++++++++ .../models/diffllama/modular_diffllama.py | 464 +++ .../configuration_dinov2_with_registers.py | 11 +- .../modeling_dinov2_with_registers.py | 50 +- .../modular_dinov2_with_registers.py | 62 +- .../models/donut/image_processing_donut.py | 2 +- .../models/dpt/image_processing_dpt.py | 2 +- .../image_processing_efficientnet.py | 2 +- src/transformers/models/emu3/__init__.py | 29 + .../models/emu3/configuration_emu3.py | 327 ++ .../models/emu3/convert_emu3_weights_to_hf.py | 448 +++ .../models/emu3/image_processing_emu3.py | 552 +++ src/transformers/models/emu3/modeling_emu3.py | 1954 +++++++++++ src/transformers/models/emu3/modular_emu3.py | 1272 +++++++ .../models/emu3/processing_emu3.py | 217 ++ .../models/encodec/modeling_encodec.py | 2 +- .../models/falcon/modeling_falcon.py | 16 +- .../models/flava/image_processing_flava.py | 2 +- .../models/fuyu/image_processing_fuyu.py | 2 +- src/transformers/models/fuyu/modeling_fuyu.py | 9 +- .../models/gemma/modeling_gemma.py | 17 +- .../models/gemma2/modeling_gemma2.py | 19 +- .../models/gemma2/modular_gemma2.py | 2 + src/transformers/models/glm/modeling_glm.py | 18 +- .../models/glpn/image_processing_glpn.py | 2 +- .../models/gpt2/tokenization_gpt2_fast.py | 11 - .../models/gpt_neox/modeling_gpt_neox.py | 16 +- .../gpt_neox/tokenization_gpt_neox_fast.py | 11 +- .../modeling_gpt_neox_japanese.py | 16 +- .../models/granite/configuration_granite.py | 10 + .../models/granite/modeling_granite.py | 17 +- .../models/granitemoe/modeling_granitemoe.py | 16 +- 
.../image_processing_grounding_dino.py | 2 +- .../grounding_dino/modeling_grounding_dino.py | 4 +- .../models/groupvit/modeling_groupvit.py | 7 + src/transformers/models/helium/__init__.py | 27 + .../models/helium/configuration_helium.py | 140 + .../models/helium/modeling_helium.py | 1065 ++++++ .../models/helium/modular_helium.py | 171 + .../idefics2/image_processing_idefics2.py | 2 +- .../models/idefics2/modeling_idefics2.py | 33 - .../idefics3/image_processing_idefics3.py | 2 +- .../models/idefics3/modeling_idefics3.py | 13 +- .../imagegpt/image_processing_imagegpt.py | 2 +- .../configuration_instructblip.py | 3 - .../configuration_instructblipvideo.py | 3 - .../image_processing_instructblipvideo.py | 2 +- .../modular_instructblipvideo.py | 3 - .../models/jetmoe/modeling_jetmoe.py | 16 +- .../layoutlmv3/image_processing_layoutlmv3.py | 2 +- .../tokenization_layoutlmv3_fast.py | 10 +- .../models/led/tokenization_led_fast.py | 10 +- .../models/levit/image_processing_levit.py | 2 +- .../models/llama/configuration_llama.py | 2 +- .../models/llama/modeling_llama.py | 17 +- .../models/llava/configuration_llava.py | 4 + .../models/llava/modeling_llava.py | 89 +- .../models/llava/processing_llava.py | 34 +- .../llava_next/configuration_llava_next.py | 4 + .../llava_next/image_processing_llava_next.py | 2 +- .../models/llava_next/modeling_llava_next.py | 97 +- .../llava_next/processing_llava_next.py | 40 +- .../configuration_llava_next_video.py | 4 + .../image_processing_llava_next_video.py | 2 +- .../modeling_llava_next_video.py | 151 +- .../modular_llava_next_video.py | 129 +- .../processing_llava_next_video.py | 74 +- .../configuration_llava_onevision.py | 4 + .../image_processing_llava_onevision.py | 2 +- .../modeling_llava_onevision.py | 16 +- .../processing_llava_onevision.py | 2 +- .../video_processing_llava_onevision.py | 2 +- .../tokenization_longformer_fast.py | 10 +- .../markuplm/tokenization_markuplm_fast.py | 10 +- .../image_processing_mask2former.py | 2 +- .../maskformer/image_processing_maskformer.py | 2 +- src/transformers/models/mimi/modeling_mimi.py | 26 +- .../models/mistral/modeling_mistral.py | 17 +- .../models/mixtral/modeling_mixtral.py | 17 +- .../models/mllama/modeling_mllama.py | 6 +- .../image_processing_mobilenet_v1.py | 2 +- .../image_processing_mobilenet_v2.py | 2 +- .../mobilevit/image_processing_mobilevit.py | 2 +- .../modernbert/configuration_modernbert.py | 5 + .../models/modernbert/modeling_modernbert.py | 184 +- .../models/modernbert/modular_modernbert.py | 166 +- src/transformers/models/moonshine/__init__.py | 27 + .../moonshine/configuration_moonshine.py | 224 ++ .../moonshine/convert_usefulsensors_to_hf.py | 169 + .../models/moonshine/modeling_moonshine.py | 1573 +++++++++ .../models/moonshine/modular_moonshine.py | 1135 +++++++ .../models/moshi/modeling_moshi.py | 41 +- .../models/mvp/tokenization_mvp_fast.py | 10 +- .../models/nemotron/modeling_nemotron.py | 8 +- .../models/nougat/image_processing_nougat.py | 2 +- src/transformers/models/olmo/modeling_olmo.py | 17 +- .../olmo2/convert_olmo2_weights_to_hf.py | 2 + .../models/olmo2/modeling_olmo2.py | 17 +- .../models/olmoe/modeling_olmoe.py | 17 +- .../oneformer/image_processing_oneformer.py | 2 +- .../models/owlv2/image_processing_owlv2.py | 2 +- .../models/owlvit/image_processing_owlvit.py | 2 +- .../models/paligemma/modeling_paligemma.py | 5 - .../perceiver/image_processing_perceiver.py | 2 +- .../models/persimmon/modeling_persimmon.py | 16 +- src/transformers/models/phi/modeling_phi.py | 19 +- 
src/transformers/models/phi/modular_phi.py | 4 +- src/transformers/models/phi3/modeling_phi3.py | 910 ++--- src/transformers/models/phi3/modular_phi3.py | 323 ++ .../models/phimoe/modeling_phimoe.py | 11 +- .../pix2struct/configuration_pix2struct.py | 43 - .../models/pix2struct/modeling_pix2struct.py | 8 - .../pixtral/convert_pixtral_weights_to_hf.py | 148 +- .../pixtral/image_processing_pixtral.py | 37 +- .../models/pixtral/modeling_pixtral.py | 9 +- .../models/pixtral/processing_pixtral.py | 52 +- .../poolformer/image_processing_poolformer.py | 2 +- .../models/pvt/image_processing_pvt.py | 2 +- .../models/qwen2/modeling_qwen2.py | 17 +- .../qwen2_audio/modeling_qwen2_audio.py | 46 +- .../qwen2_audio/processing_qwen2_audio.py | 72 +- .../models/qwen2_moe/modeling_qwen2_moe.py | 26 +- .../qwen2_vl/image_processing_qwen2_vl.py | 7 +- .../models/qwen2_vl/modeling_qwen2_vl.py | 25 +- .../roberta/tokenization_roberta_fast.py | 10 +- .../rt_detr/image_processing_rt_detr.py | 2 +- .../rt_detr/image_processing_rt_detr_fast.py | 63 +- .../models/rt_detr/modular_rt_detr.py | 577 ++++ .../models/sam/image_processing_sam.py | 2 +- .../segformer/image_processing_segformer.py | 2 +- .../models/seggpt/image_processing_seggpt.py | 2 +- .../models/siglip/image_processing_siglip.py | 2 +- .../models/siglip/modeling_siglip.py | 7 + .../modeling_speech_encoder_decoder.py | 3 + .../models/stablelm/modeling_stablelm.py | 16 +- .../models/starcoder2/modeling_starcoder2.py | 17 +- .../superpoint/image_processing_superpoint.py | 2 +- .../swin2sr/image_processing_swin2sr.py | 2 +- .../models/t5/tokenization_t5_fast.py | 1 + src/transformers/models/textnet/__init__.py | 28 + .../models/textnet/configuration_textnet.py | 135 + .../models/textnet/convert_textnet_to_hf.py | 208 ++ .../textnet/image_processing_textnet.py | 355 ++ .../models/textnet/modeling_textnet.py | 487 +++ .../configuration_timm_wrapper.py | 34 +- .../video_llava/configuration_video_llava.py | 4 + .../image_processing_video_llava.py | 2 +- .../video_llava/modeling_video_llava.py | 147 +- .../video_llava/processing_video_llava.py | 14 +- .../videomae/image_processing_videomae.py | 2 +- .../models/vilt/image_processing_vilt.py | 2 +- .../models/vipllava/modeling_vipllava.py | 77 +- .../modeling_vision_encoder_decoder.py | 9 +- .../models/vit/image_processing_vit.py | 2 +- .../vitmatte/image_processing_vitmatte.py | 2 +- src/transformers/models/vitpose/__init__.py | 28 + .../models/vitpose/configuration_vitpose.py | 124 + .../models/vitpose/convert_vitpose_to_hf.py | 355 ++ .../vitpose/image_processing_vitpose.py | 684 ++++ .../models/vitpose/modeling_vitpose.py | 340 ++ .../models/vitpose_backbone/__init__.py | 54 + .../configuration_vitpose_backbone.py | 136 + .../modeling_vitpose_backbone.py | 542 +++ .../models/vivit/image_processing_vivit.py | 2 +- .../models/whisper/generation_whisper.py | 14 +- .../whisper/tokenization_whisper_fast.py | 9 +- .../models/x_clip/modeling_x_clip.py | 7 + .../xlm_roberta_xl/modeling_xlm_roberta_xl.py | 18 +- .../models/yolos/image_processing_yolos.py | 2 +- .../models/zamba/modeling_zamba.py | 390 +-- .../zoedepth/image_processing_zoedepth.py | 2 +- src/transformers/pipelines/__init__.py | 2 +- .../pipelines/audio_classification.py | 2 +- .../pipelines/automatic_speech_recognition.py | 6 + src/transformers/pipelines/base.py | 67 +- .../pipelines/document_question_answering.py | 9 +- src/transformers/pipelines/image_to_text.py | 6 + .../pipelines/table_question_answering.py | 7 + 
.../pipelines/text2text_generation.py | 6 + src/transformers/pipelines/text_generation.py | 45 +- src/transformers/pipelines/text_to_audio.py | 13 +- .../pipelines/visual_question_answering.py | 10 +- src/transformers/processing_utils.py | 137 +- src/transformers/quantizers/quantizer_gptq.py | 43 +- src/transformers/testing_utils.py | 56 +- src/transformers/tokenization_utils_base.py | 11 +- src/transformers/tokenization_utils_fast.py | 35 +- src/transformers/trainer.py | 12 +- src/transformers/trainer_callback.py | 8 +- src/transformers/trainer_seq2seq.py | 6 +- src/transformers/training_args.py | 12 +- src/transformers/training_args_seq2seq.py | 6 - src/transformers/utils/__init__.py | 3 + src/transformers/utils/dummy_pt_objects.py | 189 ++ .../utils/dummy_vision_objects.py | 21 + src/transformers/utils/import_utils.py | 39 +- src/transformers/utils/notebook.py | 4 + src/transformers/utils/quantization_config.py | 63 +- tests/deepspeed/test_deepspeed.py | 28 +- tests/fsdp/test_fsdp.py | 3 +- tests/generation/test_utils.py | 75 +- tests/models/blip_2/test_modeling_blip_2.py | 18 +- .../chameleon/test_modeling_chameleon.py | 2 +- tests/models/cohere2/test_modeling_cohere2.py | 5 - tests/models/dbrx/test_modeling_dbrx.py | 2 +- tests/models/diffllama/__init__.py | 0 .../diffllama/test_modeling_diffllama.py | 992 ++++++ tests/models/emu3/__init__.py | 0 tests/models/emu3/test_modeling_emu3.py | 542 +++ tests/models/emu3/test_processor_emu3.py | 85 + tests/models/encodec/test_modeling_encodec.py | 23 +- .../test_modeling_falcon_mamba.py | 4 +- tests/models/fuyu/test_modeling_fuyu.py | 4 +- tests/models/helium/__init__.py | 0 tests/models/helium/test_modeling_helium.py | 110 + tests/models/idefics/test_modeling_idefics.py | 2 +- .../test_modeling_instructblip.py | 5 +- .../test_modeling_instructblipvideo.py | 3 +- tests/models/llama/test_modeling_llama.py | 5 +- tests/models/llava/test_modeling_llava.py | 77 +- tests/models/llava/test_processor_llava.py | 58 +- .../llava_next/test_modeling_llava_next.py | 88 +- .../llava_next/test_processor_llava_next.py | 6 +- .../test_modeling_llava_next_video.py | 105 - .../test_processor_llava_onevision.py | 64 +- tests/models/mistral/test_modeling_mistral.py | 2 +- tests/models/mixtral/test_modeling_mixtral.py | 5 +- tests/models/mllama/test_processor_mllama.py | 62 +- .../test_modeling_mobilenet_v1.py | 2 + .../modernbert/test_modeling_modernbert.py | 144 +- tests/models/moonshine/__init__.py | 0 .../moonshine/test_modeling_moonshine.py | 620 ++++ .../models/nemotron/test_modeling_nemotron.py | 3 +- .../omdet_turbo/test_modeling_omdet_turbo.py | 8 +- .../paligemma/test_modeling_paligemma.py | 2 +- tests/models/phi3/test_modeling_phi3.py | 3 + .../models/pixtral/test_processor_pixtral.py | 2 +- .../qwen2_audio/test_modeling_qwen2_audio.py | 73 +- .../test_image_processing_qwen2_vl.py | 39 +- .../models/qwen2_vl/test_modeling_qwen2_vl.py | 5 +- tests/models/rt_detr/test_modeling_rt_detr.py | 12 +- .../starcoder2/test_modeling_starcoder2.py | 3 +- .../test_modeling_switch_transformers.py | 2 + tests/models/t5/test_modeling_t5.py | 6 +- tests/models/textnet/__init__.py | 0 .../textnet/test_image_processing_textnet.py | 126 + tests/models/textnet/test_modeling_textnet.py | 348 ++ .../test_modeling_timm_wrapper.py | 62 + tests/models/upernet/test_modeling_upernet.py | 1 - .../video_llava/test_modeling_video_llava.py | 127 +- .../models/vipllava/test_modeling_vipllava.py | 64 +- tests/models/vitpose/__init__.py | 0 .../vitpose/test_image_processing_vitpose.py | 
229 ++ tests/models/vitpose/test_modeling_vitpose.py | 332 ++ tests/models/vitpose_backbone/__init__.py | 0 .../test_modeling_vitpose_backbone.py | 199 ++ tests/models/zamba/test_modeling_zamba.py | 6 +- .../peft_integration/test_peft_integration.py | 65 +- ..._pipelines_automatic_speech_recognition.py | 14 + .../test_pipelines_text_generation.py | 61 +- tests/quantization/ggml/test_ggml.py | 6 +- tests/quantization/gptq/test_gptq.py | 125 +- tests/quantization/hqq/test_hqq.py | 5 + .../quanto_integration/test_quanto.py | 5 +- .../modular/test_conversion_order.py | 63 + tests/test_image_processing_common.py | 18 +- tests/test_modeling_common.py | 80 +- tests/test_processing_common.py | 27 +- tests/test_tokenization_common.py | 20 +- tests/tokenization/test_tokenization_utils.py | 19 + tests/trainer/test_data_collator.py | 46 + tests/trainer/test_trainer.py | 3010 +++++++++-------- tests/utils/test_cache_utils.py | 4 +- tests/utils/test_modeling_rope_utils.py | 24 +- tests/utils/test_modeling_utils.py | 55 + utils/check_config_attributes.py | 9 + utils/check_copies.py | 1 + utils/check_modular_conversion.py | 4 +- utils/check_repo.py | 8 + utils/create_dependency_mapping.py | 59 +- utils/modular_model_converter.py | 20 +- 439 files changed, 29550 insertions(+), 6272 deletions(-) create mode 100644 CODEOWNERS create mode 100644 docs/source/ar/tasks/multiple_choice.md create mode 100644 docs/source/ar/tasks/token_classification.md create mode 100644 docs/source/ar/tasks/translation.md create mode 100644 docs/source/en/model_doc/diffllama.md create mode 100644 docs/source/en/model_doc/emu3.md create mode 100644 docs/source/en/model_doc/helium.md create mode 100644 docs/source/en/model_doc/moonshine.md create mode 100644 docs/source/en/model_doc/textnet.md create mode 100644 docs/source/en/model_doc/vitpose.md create mode 100644 docs/source/ko/model_doc/altclip.md create mode 100644 read_video.py create mode 100644 run.py create mode 100644 src/transformers/models/cohere/modular_cohere.py create mode 100644 src/transformers/models/deformable_detr/modular_deformable_detr.py create mode 100644 src/transformers/models/diffllama/__init__.py create mode 100644 src/transformers/models/diffllama/configuration_diffllama.py create mode 100644 src/transformers/models/diffllama/modeling_diffllama.py create mode 100644 src/transformers/models/diffllama/modular_diffllama.py create mode 100644 src/transformers/models/emu3/__init__.py create mode 100644 src/transformers/models/emu3/configuration_emu3.py create mode 100644 src/transformers/models/emu3/convert_emu3_weights_to_hf.py create mode 100644 src/transformers/models/emu3/image_processing_emu3.py create mode 100644 src/transformers/models/emu3/modeling_emu3.py create mode 100644 src/transformers/models/emu3/modular_emu3.py create mode 100644 src/transformers/models/emu3/processing_emu3.py create mode 100644 src/transformers/models/helium/__init__.py create mode 100644 src/transformers/models/helium/configuration_helium.py create mode 100644 src/transformers/models/helium/modeling_helium.py create mode 100644 src/transformers/models/helium/modular_helium.py create mode 100644 src/transformers/models/moonshine/__init__.py create mode 100644 src/transformers/models/moonshine/configuration_moonshine.py create mode 100644 src/transformers/models/moonshine/convert_usefulsensors_to_hf.py create mode 100644 src/transformers/models/moonshine/modeling_moonshine.py create mode 100644 src/transformers/models/moonshine/modular_moonshine.py create mode 100644 
src/transformers/models/phi3/modular_phi3.py create mode 100644 src/transformers/models/rt_detr/modular_rt_detr.py create mode 100644 src/transformers/models/textnet/__init__.py create mode 100644 src/transformers/models/textnet/configuration_textnet.py create mode 100644 src/transformers/models/textnet/convert_textnet_to_hf.py create mode 100644 src/transformers/models/textnet/image_processing_textnet.py create mode 100644 src/transformers/models/textnet/modeling_textnet.py create mode 100644 src/transformers/models/vitpose/__init__.py create mode 100644 src/transformers/models/vitpose/configuration_vitpose.py create mode 100644 src/transformers/models/vitpose/convert_vitpose_to_hf.py create mode 100644 src/transformers/models/vitpose/image_processing_vitpose.py create mode 100644 src/transformers/models/vitpose/modeling_vitpose.py create mode 100644 src/transformers/models/vitpose_backbone/__init__.py create mode 100644 src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py create mode 100644 src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py create mode 100644 tests/models/diffllama/__init__.py create mode 100644 tests/models/diffllama/test_modeling_diffllama.py create mode 100644 tests/models/emu3/__init__.py create mode 100644 tests/models/emu3/test_modeling_emu3.py create mode 100644 tests/models/emu3/test_processor_emu3.py create mode 100644 tests/models/helium/__init__.py create mode 100644 tests/models/helium/test_modeling_helium.py create mode 100644 tests/models/moonshine/__init__.py create mode 100644 tests/models/moonshine/test_modeling_moonshine.py create mode 100644 tests/models/textnet/__init__.py create mode 100644 tests/models/textnet/test_image_processing_textnet.py create mode 100644 tests/models/textnet/test_modeling_textnet.py create mode 100644 tests/models/vitpose/__init__.py create mode 100644 tests/models/vitpose/test_image_processing_vitpose.py create mode 100644 tests/models/vitpose/test_modeling_vitpose.py create mode 100644 tests/models/vitpose_backbone/__init__.py create mode 100644 tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py create mode 100644 tests/repo_utils/modular/test_conversion_order.py diff --git a/CODEOWNERS b/CODEOWNERS new file mode 100644 index 00000000000000..6d32e835dfa43e --- /dev/null +++ b/CODEOWNERS @@ -0,0 +1,369 @@ +# Top-level rules are matched only if nothing else matches +* @Rocketknight1 @ArthurZucker # if no one is pinged based on the other rules, he will do the dispatch +**.md @stevhliu +docs/ @stevhliu +/benchmark/ @McPatate +/docker/ @ydshieh @ArthurZucker + +# More high-level globs catch cases when specific rules later don't apply +/src/transformers/models/*/*processing* @molbap @yonigozlan @qubvel +/src/transformers/models/*/image_processing* @qubvel +/src/transformers/models/*/image_processing_*_fast* @yonigozlan +/src/transformers/**/*_tokenization* @ArthurZucker + +# Owners of subsections of the library +/src/transformers/generation/ @gante +/src/transformers/pipeline/ @Rocketknight1 @yonigozlan +/src/transformers/integrations/ @SunMarc @MekkCyber @muellerzr +/src/transformers/quantizers/ @SunMarc @MekkCyber +/src/transformers/tests/ @ydshieh +/src/transformers/tests/generation/ @gante +/src/transformers/models/auto/ @ArthurZucker +/src/transformers/utils/ @ArthurZucker @Rocketknight1 +/src/transformers/loss/ @ArthurZucker +/src/transformers/onnx/ @michaelbenayoun + +# Specific files come after the sections/globs, so they take priority +/.circleci/config.yml 
@ArthurZucker @ydshieh +/utils/tests_fetcher.py @ydshieh +trainer.py @muellerzr @SunMarc +trainer_utils.py @muellerzr @SunMarc +/utils/modular_model_converter.py @Cyrilvallez @ArthurZucker + +# Owners of individual models are specific / high priority, and so they come last +# mod* captures modeling and modular files + +# Text models +/src/transformers/models/albert/mod*_albert* @ArthurZucker +/src/transformers/models/bamba/mod*_bamba* @ArthurZucker +/src/transformers/models/bart/mod*_bart* @ArthurZucker +/src/transformers/models/barthez/mod*_barthez* @ArthurZucker +/src/transformers/models/bartpho/mod*_bartpho* @ArthurZucker +/src/transformers/models/bert/mod*_bert* @ArthurZucker +/src/transformers/models/bert_generation/mod*_bert_generation* @ArthurZucker +/src/transformers/models/bert_japanese/mod*_bert_japanese* @ArthurZucker +/src/transformers/models/bertweet/mod*_bertweet* @ArthurZucker +/src/transformers/models/big_bird/mod*_big_bird* @ArthurZucker +/src/transformers/models/bigbird_pegasus/mod*_bigbird_pegasus* @ArthurZucker +/src/transformers/models/biogpt/mod*_biogpt* @ArthurZucker +/src/transformers/models/blenderbot/mod*_blenderbot* @ArthurZucker +/src/transformers/models/blenderbot_small/mod*_blenderbot_small* @ArthurZucker +/src/transformers/models/bloom/mod*_bloom* @ArthurZucker +/src/transformers/models/bort/mod*_bort* @ArthurZucker +/src/transformers/models/byt5/mod*_byt5* @ArthurZucker +/src/transformers/models/camembert/mod*_camembert* @ArthurZucker +/src/transformers/models/canine/mod*_canine* @ArthurZucker +/src/transformers/models/codegen/mod*_codegen* @ArthurZucker +/src/transformers/models/code_llama/mod*_code_llama* @ArthurZucker +/src/transformers/models/cohere/mod*_cohere* @ArthurZucker +/src/transformers/models/cohere2/mod*_cohere2* @ArthurZucker +/src/transformers/models/convbert/mod*_convbert* @ArthurZucker +/src/transformers/models/cpm/mod*_cpm* @ArthurZucker +/src/transformers/models/cpmant/mod*_cpmant* @ArthurZucker +/src/transformers/models/ctrl/mod*_ctrl* @ArthurZucker +/src/transformers/models/dbrx/mod*_dbrx* @ArthurZucker +/src/transformers/models/deberta/mod*_deberta* @ArthurZucker +/src/transformers/models/deberta_v2/mod*_deberta_v2* @ArthurZucker +/src/transformers/models/dialogpt/mod*_dialogpt* @ArthurZucker +/src/transformers/models/diffllama/mod*_diffllama* @ArthurZucker +/src/transformers/models/distilbert/mod*_distilbert* @ArthurZucker +/src/transformers/models/dpr/mod*_dpr* @ArthurZucker +/src/transformers/models/electra/mod*_electra* @ArthurZucker +/src/transformers/models/encoder_decoder/mod*_encoder_decoder* @ArthurZucker +/src/transformers/models/ernie/mod*_ernie* @ArthurZucker +/src/transformers/models/ernie_m/mod*_ernie_m* @ArthurZucker +/src/transformers/models/esm/mod*_esm* @ArthurZucker +/src/transformers/models/falcon/mod*_falcon* @ArthurZucker +/src/transformers/models/falcon3/mod*_falcon3* @ArthurZucker +/src/transformers/models/falcon_mamba/mod*_falcon_mamba* @ArthurZucker +/src/transformers/models/fastspeech2_conformer/mod*_fastspeech2_conformer* @ArthurZucker +/src/transformers/models/flan_t5/mod*_flan_t5* @ArthurZucker +/src/transformers/models/flan_ul2/mod*_flan_ul2* @ArthurZucker +/src/transformers/models/flaubert/mod*_flaubert* @ArthurZucker +/src/transformers/models/fnet/mod*_fnet* @ArthurZucker +/src/transformers/models/fsmt/mod*_fsmt* @ArthurZucker +/src/transformers/models/funnel/mod*_funnel* @ArthurZucker +/src/transformers/models/fuyu/mod*_fuyu* @ArthurZucker +/src/transformers/models/gemma/mod*_gemma* @ArthurZucker 
+/src/transformers/models/gemma2/mod*_gemma2* @ArthurZucker +/src/transformers/models/glm/mod*_glm* @ArthurZucker +/src/transformers/models/openai_gpt/mod*_openai_gpt* @ArthurZucker +/src/transformers/models/gpt_neo/mod*_gpt_neo* @ArthurZucker +/src/transformers/models/gpt_neox/mod*_gpt_neox* @ArthurZucker +/src/transformers/models/gpt_neox_japanese/mod*_gpt_neox_japanese* @ArthurZucker +/src/transformers/models/gptj/mod*_gptj* @ArthurZucker +/src/transformers/models/gpt2/mod*_gpt2* @ArthurZucker +/src/transformers/models/gpt_bigcode/mod*_gpt_bigcode* @ArthurZucker +/src/transformers/models/gptsan_japanese/mod*_gptsan_japanese* @ArthurZucker +/src/transformers/models/gpt_sw3/mod*_gpt_sw3* @ArthurZucker +/src/transformers/models/granite/mod*_granite* @ArthurZucker +/src/transformers/models/granitemoe/mod*_granitemoe* @ArthurZucker +/src/transformers/models/herbert/mod*_herbert* @ArthurZucker +/src/transformers/models/ibert/mod*_ibert* @ArthurZucker +/src/transformers/models/jamba/mod*_jamba* @ArthurZucker +/src/transformers/models/jetmoe/mod*_jetmoe* @ArthurZucker +/src/transformers/models/jukebox/mod*_jukebox* @ArthurZucker +/src/transformers/models/led/mod*_led* @ArthurZucker +/src/transformers/models/llama/mod*_llama* @ArthurZucker @Cyrilvallez +/src/transformers/models/longformer/mod*_longformer* @ArthurZucker +/src/transformers/models/longt5/mod*_longt5* @ArthurZucker +/src/transformers/models/luke/mod*_luke* @ArthurZucker +/src/transformers/models/m2m_100/mod*_m2m_100* @ArthurZucker +/src/transformers/models/madlad_400/mod*_madlad_400* @ArthurZucker +/src/transformers/models/mamba/mod*_mamba* @ArthurZucker +/src/transformers/models/mamba2/mod*_mamba2* @ArthurZucker +/src/transformers/models/marian/mod*_marian* @ArthurZucker +/src/transformers/models/markuplm/mod*_markuplm* @ArthurZucker +/src/transformers/models/mbart/mod*_mbart* @ArthurZucker +/src/transformers/models/mega/mod*_mega* @ArthurZucker +/src/transformers/models/megatron_bert/mod*_megatron_bert* @ArthurZucker +/src/transformers/models/megatron_gpt2/mod*_megatron_gpt2* @ArthurZucker +/src/transformers/models/mistral/mod*_mistral* @ArthurZucker +/src/transformers/models/mixtral/mod*_mixtral* @ArthurZucker +/src/transformers/models/mluke/mod*_mluke* @ArthurZucker +/src/transformers/models/mobilebert/mod*_mobilebert* @ArthurZucker +/src/transformers/models/modernbert/mod*_modernbert* @ArthurZucker +/src/transformers/models/mpnet/mod*_mpnet* @ArthurZucker +/src/transformers/models/mpt/mod*_mpt* @ArthurZucker +/src/transformers/models/mra/mod*_mra* @ArthurZucker +/src/transformers/models/mt5/mod*_mt5* @ArthurZucker +/src/transformers/models/mvp/mod*_mvp* @ArthurZucker +/src/transformers/models/myt5/mod*_myt5* @ArthurZucker +/src/transformers/models/nemotron/mod*_nemotron* @ArthurZucker +/src/transformers/models/nezha/mod*_nezha* @ArthurZucker +/src/transformers/models/nllb/mod*_nllb* @ArthurZucker +/src/transformers/models/nllb_moe/mod*_nllb_moe* @ArthurZucker +/src/transformers/models/nystromformer/mod*_nystromformer* @ArthurZucker +/src/transformers/models/olmo/mod*_olmo* @ArthurZucker +/src/transformers/models/olmo2/mod*_olmo2* @ArthurZucker +/src/transformers/models/olmoe/mod*_olmoe* @ArthurZucker +/src/transformers/models/open_llama/mod*_open_llama* @ArthurZucker +/src/transformers/models/opt/mod*_opt* @ArthurZucker +/src/transformers/models/pegasus/mod*_pegasus* @ArthurZucker +/src/transformers/models/pegasus_x/mod*_pegasus_x* @ArthurZucker +/src/transformers/models/persimmon/mod*_persimmon* @ArthurZucker 
+/src/transformers/models/phi/mod*_phi* @ArthurZucker +/src/transformers/models/phi3/mod*_phi3* @ArthurZucker +/src/transformers/models/phimoe/mod*_phimoe* @ArthurZucker +/src/transformers/models/phobert/mod*_phobert* @ArthurZucker +/src/transformers/models/plbart/mod*_plbart* @ArthurZucker +/src/transformers/models/prophetnet/mod*_prophetnet* @ArthurZucker +/src/transformers/models/qdqbert/mod*_qdqbert* @ArthurZucker +/src/transformers/models/qwen2/mod*_qwen2* @ArthurZucker +/src/transformers/models/qwen2_moe/mod*_qwen2_moe* @ArthurZucker +/src/transformers/models/rag/mod*_rag* @ArthurZucker +/src/transformers/models/realm/mod*_realm* @ArthurZucker +/src/transformers/models/recurrent_gemma/mod*_recurrent_gemma* @ArthurZucker +/src/transformers/models/reformer/mod*_reformer* @ArthurZucker +/src/transformers/models/rembert/mod*_rembert* @ArthurZucker +/src/transformers/models/retribert/mod*_retribert* @ArthurZucker +/src/transformers/models/roberta/mod*_roberta* @ArthurZucker +/src/transformers/models/roberta_prelayernorm/mod*_roberta_prelayernorm* @ArthurZucker +/src/transformers/models/roc_bert/mod*_roc_bert* @ArthurZucker +/src/transformers/models/roformer/mod*_roformer* @ArthurZucker +/src/transformers/models/rwkv/mod*_rwkv* @ArthurZucker +/src/transformers/models/splinter/mod*_splinter* @ArthurZucker +/src/transformers/models/squeezebert/mod*_squeezebert* @ArthurZucker +/src/transformers/models/stablelm/mod*_stablelm* @ArthurZucker +/src/transformers/models/starcoder2/mod*_starcoder2* @ArthurZucker +/src/transformers/models/switch_transformers/mod*_switch_transformers* @ArthurZucker +/src/transformers/models/t5/mod*_t5* @ArthurZucker +/src/transformers/models/t5v1.1/mod*_t5v1.1* @ArthurZucker +/src/transformers/models/tapex/mod*_tapex* @ArthurZucker +/src/transformers/models/transfo_xl/mod*_transfo_xl* @ArthurZucker +/src/transformers/models/ul2/mod*_ul2* @ArthurZucker +/src/transformers/models/umt5/mod*_umt5* @ArthurZucker +/src/transformers/models/xmod/mod*_xmod* @ArthurZucker +/src/transformers/models/xglm/mod*_xglm* @ArthurZucker +/src/transformers/models/xlm/mod*_xlm* @ArthurZucker +/src/transformers/models/xlm_prophetnet/mod*_xlm_prophetnet* @ArthurZucker +/src/transformers/models/xlm_roberta/mod*_xlm_roberta* @ArthurZucker +/src/transformers/models/xlm_roberta_xl/mod*_xlm_roberta_xl* @ArthurZucker +/src/transformers/models/xlm_v/mod*_xlm_v* @ArthurZucker +/src/transformers/models/xlnet/mod*_xlnet* @ArthurZucker +/src/transformers/models/yoso/mod*_yoso* @ArthurZucker +/src/transformers/models/zamba/mod*_zamba* @ArthurZucker + +# Vision models +/src/transformers/models/beit/mod*_beit* @amyeroberts @qubvel +/src/transformers/models/bit/mod*_bit* @amyeroberts @qubvel +/src/transformers/models/conditional_detr/mod*_conditional_detr* @amyeroberts @qubvel +/src/transformers/models/convnext/mod*_convnext* @amyeroberts @qubvel +/src/transformers/models/convnextv2/mod*_convnextv2* @amyeroberts @qubvel +/src/transformers/models/cvt/mod*_cvt* @amyeroberts @qubvel +/src/transformers/models/deformable_detr/mod*_deformable_detr* @amyeroberts @qubvel +/src/transformers/models/deit/mod*_deit* @amyeroberts @qubvel +/src/transformers/models/depth_anything/mod*_depth_anything* @amyeroberts @qubvel +/src/transformers/models/depth_anything_v2/mod*_depth_anything_v2* @amyeroberts @qubvel +/src/transformers/models/deta/mod*_deta* @amyeroberts @qubvel +/src/transformers/models/detr/mod*_detr* @amyeroberts @qubvel +/src/transformers/models/dinat/mod*_dinat* @amyeroberts @qubvel 
+/src/transformers/models/dinov2/mod*_dinov2* @amyeroberts @qubvel +/src/transformers/models/dinov2_with_registers/mod*_dinov2_with_registers* @amyeroberts @qubvel +/src/transformers/models/dit/mod*_dit* @amyeroberts @qubvel +/src/transformers/models/dpt/mod*_dpt* @amyeroberts @qubvel +/src/transformers/models/efficientformer/mod*_efficientformer* @amyeroberts @qubvel +/src/transformers/models/efficientnet/mod*_efficientnet* @amyeroberts @qubvel +/src/transformers/models/focalnet/mod*_focalnet* @amyeroberts @qubvel +/src/transformers/models/glpn/mod*_glpn* @amyeroberts @qubvel +/src/transformers/models/hiera/mod*_hiera* @amyeroberts @qubvel +/src/transformers/models/ijepa/mod*_ijepa* @amyeroberts @qubvel +/src/transformers/models/imagegpt/mod*_imagegpt* @amyeroberts @qubvel +/src/transformers/models/levit/mod*_levit* @amyeroberts @qubvel +/src/transformers/models/mask2former/mod*_mask2former* @amyeroberts @qubvel +/src/transformers/models/maskformer/mod*_maskformer* @amyeroberts @qubvel +/src/transformers/models/mobilenet_v1/mod*_mobilenet_v1* @amyeroberts @qubvel +/src/transformers/models/mobilenet_v2/mod*_mobilenet_v2* @amyeroberts @qubvel +/src/transformers/models/mobilevit/mod*_mobilevit* @amyeroberts @qubvel +/src/transformers/models/mobilevitv2/mod*_mobilevitv2* @amyeroberts @qubvel +/src/transformers/models/nat/mod*_nat* @amyeroberts @qubvel +/src/transformers/models/poolformer/mod*_poolformer* @amyeroberts @qubvel +/src/transformers/models/pvt/mod*_pvt* @amyeroberts @qubvel +/src/transformers/models/pvt_v2/mod*_pvt_v2* @amyeroberts @qubvel +/src/transformers/models/regnet/mod*_regnet* @amyeroberts @qubvel +/src/transformers/models/resnet/mod*_resnet* @amyeroberts @qubvel +/src/transformers/models/rt_detr/mod*_rt_detr* @amyeroberts @qubvel +/src/transformers/models/segformer/mod*_segformer* @amyeroberts @qubvel +/src/transformers/models/seggpt/mod*_seggpt* @amyeroberts @qubvel +/src/transformers/models/superpoint/mod*_superpoint* @amyeroberts @qubvel +/src/transformers/models/swiftformer/mod*_swiftformer* @amyeroberts @qubvel +/src/transformers/models/swin/mod*_swin* @amyeroberts @qubvel +/src/transformers/models/swinv2/mod*_swinv2* @amyeroberts @qubvel +/src/transformers/models/swin2sr/mod*_swin2sr* @amyeroberts @qubvel +/src/transformers/models/table_transformer/mod*_table_transformer* @amyeroberts @qubvel +/src/transformers/models/textnet/mod*_textnet* @amyeroberts @qubvel +/src/transformers/models/timm_wrapper/mod*_timm_wrapper* @amyeroberts @qubvel +/src/transformers/models/upernet/mod*_upernet* @amyeroberts @qubvel +/src/transformers/models/van/mod*_van* @amyeroberts @qubvel +/src/transformers/models/vit/mod*_vit* @amyeroberts @qubvel +/src/transformers/models/vit_hybrid/mod*_vit_hybrid* @amyeroberts @qubvel +/src/transformers/models/vitdet/mod*_vitdet* @amyeroberts @qubvel +/src/transformers/models/vit_mae/mod*_vit_mae* @amyeroberts @qubvel +/src/transformers/models/vitmatte/mod*_vitmatte* @amyeroberts @qubvel +/src/transformers/models/vit_msn/mod*_vit_msn* @amyeroberts @qubvel +/src/transformers/models/vitpose/mod*_vitpose* @amyeroberts @qubvel +/src/transformers/models/yolos/mod*_yolos* @amyeroberts @qubvel +/src/transformers/models/zoedepth/mod*_zoedepth* @amyeroberts @qubvel + +# Audio models +/src/transformers/models/audio_spectrogram_transformer/mod*_audio_spectrogram_transformer* @eustlb +/src/transformers/models/bark/mod*_bark* @eustlb +/src/transformers/models/clap/mod*_clap* @eustlb +/src/transformers/models/dac/mod*_dac* @eustlb 
+/src/transformers/models/encodec/mod*_encodec* @eustlb +/src/transformers/models/hubert/mod*_hubert* @eustlb +/src/transformers/models/mctct/mod*_mctct* @eustlb +/src/transformers/models/mimi/mod*_mimi* @eustlb +/src/transformers/models/mms/mod*_mms* @eustlb +/src/transformers/models/moshi/mod*_moshi* @eustlb +/src/transformers/models/musicgen/mod*_musicgen* @eustlb +/src/transformers/models/musicgen_melody/mod*_musicgen_melody* @eustlb +/src/transformers/models/pop2piano/mod*_pop2piano* @eustlb +/src/transformers/models/seamless_m4t/mod*_seamless_m4t* @eustlb +/src/transformers/models/seamless_m4t_v2/mod*_seamless_m4t_v2* @eustlb +/src/transformers/models/sew/mod*_sew* @eustlb +/src/transformers/models/sew_d/mod*_sew_d* @eustlb +/src/transformers/models/speech_to_text/mod*_speech_to_text* @eustlb +/src/transformers/models/speech_to_text_2/mod*_speech_to_text_2* @eustlb +/src/transformers/models/speecht5/mod*_speecht5* @eustlb +/src/transformers/models/unispeech/mod*_unispeech* @eustlb +/src/transformers/models/unispeech_sat/mod*_unispeech_sat* @eustlb +/src/transformers/models/univnet/mod*_univnet* @eustlb +/src/transformers/models/vits/mod*_vits* @eustlb +/src/transformers/models/wav2vec2/mod*_wav2vec2* @eustlb +/src/transformers/models/wav2vec2_bert/mod*_wav2vec2_bert* @eustlb +/src/transformers/models/wav2vec2_conformer/mod*_wav2vec2_conformer* @eustlb +/src/transformers/models/wav2vec2_phoneme/mod*_wav2vec2_phoneme* @eustlb +/src/transformers/models/wavlm/mod*_wavlm* @eustlb +/src/transformers/models/whisper/mod*_whisper* @eustlb +/src/transformers/models/xls_r/mod*_xls_r* @eustlb +/src/transformers/models/xlsr_wav2vec2/mod*_xlsr_wav2vec2* @eustlb + +# Video models +/src/transformers/models/timesformer/mod*_timesformer* @Rocketknight1 +/src/transformers/models/videomae/mod*_videomae* @Rocketknight1 +/src/transformers/models/vivit/mod*_vivit* @Rocketknight1 + +# Multimodal models +/src/transformers/models/align/mod*_align* @zucchini-nlp +/src/transformers/models/altclip/mod*_altclip* @zucchini-nlp +/src/transformers/models/aria/mod*_aria* @zucchini-nlp +/src/transformers/models/blip/mod*_blip* @zucchini-nlp +/src/transformers/models/blip_2/mod*_blip_2* @zucchini-nlp +/src/transformers/models/bridgetower/mod*_bridgetower* @zucchini-nlp +/src/transformers/models/bros/mod*_bros* @zucchini-nlp +/src/transformers/models/chameleon/mod*_chameleon* @zucchini-nlp +/src/transformers/models/chinese_clip/mod*_chinese_clip* @zucchini-nlp +/src/transformers/models/clip/mod*_clip* @zucchini-nlp +/src/transformers/models/clipseg/mod*_clipseg* @zucchini-nlp +/src/transformers/models/clvp/mod*_clvp* @zucchini-nlp +/src/transformers/models/colpali/mod*_colpali* @zucchini-nlp @yonigozlan +/src/transformers/models/data2vec/mod*_data2vec* @zucchini-nlp +/src/transformers/models/deplot/mod*_deplot* @zucchini-nlp +/src/transformers/models/donut/mod*_donut* @zucchini-nlp +/src/transformers/models/flava/mod*_flava* @zucchini-nlp +/src/transformers/models/git/mod*_git* @zucchini-nlp +/src/transformers/models/grounding_dino/mod*_grounding_dino* @qubvel +/src/transformers/models/groupvit/mod*_groupvit* @zucchini-nlp +/src/transformers/models/idefics/mod*_idefics* @zucchini-nlp +/src/transformers/models/idefics2/mod*_idefics2* @zucchini-nlp +/src/transformers/models/idefics3/mod*_idefics3* @zucchini-nlp +/src/transformers/models/instructblip/mod*_instructblip* @zucchini-nlp +/src/transformers/models/instructblipvideo/mod*_instructblipvideo* @zucchini-nlp +/src/transformers/models/kosmos_2/mod*_kosmos_2* 
@zucchini-nlp +/src/transformers/models/layoutlm/mod*_layoutlm* @NielsRogge +/src/transformers/models/layoutlmv2/mod*_layoutlmv2* @NielsRogge +/src/transformers/models/layoutlmv3/mod*_layoutlmv3* @NielsRogge +/src/transformers/models/layoutxlm/mod*_layoutxlm* @NielsRogge +/src/transformers/models/lilt/mod*_lilt* @zucchini-nlp +/src/transformers/models/llava/mod*_llava* @zucchini-nlp @arthurzucker +/src/transformers/models/llava_next/mod*_llava_next* @zucchini-nlp +/src/transformers/models/llava_next_video/mod*_llava_next_video* @zucchini-nlp +/src/transformers/models/llava_onevision/mod*_llava_onevision* @zucchini-nlp +/src/transformers/models/lxmert/mod*_lxmert* @zucchini-nlp +/src/transformers/models/matcha/mod*_matcha* @zucchini-nlp +/src/transformers/models/mgp_str/mod*_mgp_str* @zucchini-nlp +/src/transformers/models/mllama/mod*_mllama* @zucchini-nlp +/src/transformers/models/nougat/mod*_nougat* @NielsRogge +/src/transformers/models/omdet_turbo/mod*_omdet_turbo* @qubvel @yonigozlan +/src/transformers/models/oneformer/mod*_oneformer* @zucchini-nlp +/src/transformers/models/owlvit/mod*_owlvit* @qubvel +/src/transformers/models/owlv2/mod*_owlv2* @qubvel +/src/transformers/models/paligemma/mod*_paligemma* @zucchini-nlp @molbap +/src/transformers/models/perceiver/mod*_perceiver* @zucchini-nlp +/src/transformers/models/pix2struct/mod*_pix2struct* @zucchini-nlp +/src/transformers/models/pixtral/mod*_pixtral* @zucchini-nlp @ArthurZucker +/src/transformers/models/qwen2_audio/mod*_qwen2_audio* @zucchini-nlp @ArthurZucker +/src/transformers/models/qwen2_vl/mod*_qwen2_vl* @zucchini-nlp @ArthurZucker +/src/transformers/models/sam/mod*_sam* @zucchini-nlp @ArthurZucker +/src/transformers/models/siglip/mod*_siglip* @zucchini-nlp +/src/transformers/models/speech_encoder_decoder/mod*_speech_encoder_decoder* @zucchini-nlp +/src/transformers/models/tapas/mod*_tapas* @NielsRogge +/src/transformers/models/trocr/mod*_trocr* @zucchini-nlp +/src/transformers/models/tvlt/mod*_tvlt* @zucchini-nlp +/src/transformers/models/tvp/mod*_tvp* @zucchini-nlp +/src/transformers/models/udop/mod*_udop* @zucchini-nlp +/src/transformers/models/video_llava/mod*_video_llava* @zucchini-nlp +/src/transformers/models/vilt/mod*_vilt* @zucchini-nlp +/src/transformers/models/vipllava/mod*_vipllava* @zucchini-nlp +/src/transformers/models/vision_encoder_decoder/mod*_vision_encoder_decoder* @Rocketknight1 +/src/transformers/models/vision_text_dual_encoder/mod*_vision_text_dual_encoder* @Rocketknight1 +/src/transformers/models/visual_bert/mod*_visual_bert* @zucchini-nlp +/src/transformers/models/xclip/mod*_xclip* @zucchini-nlp + +# Reinforcement learning models +/src/transformers/models/decision_transformer/mod*_decision_transformer* @Rocketknight1 +/src/transformers/models/trajectory_transformer/mod*_trajectory_transformer* @Rocketknight1 + +# Time series models +/src/transformers/models/autoformer/mod*_autoformer* @Rocketknight1 +/src/transformers/models/informer/mod*_informer* @Rocketknight1 +/src/transformers/models/patchtsmixer/mod*_patchtsmixer* @Rocketknight1 +/src/transformers/models/patchtst/mod*_patchtst* @Rocketknight1 +/src/transformers/models/time_series_transformer/mod*_time_series_transformer* @Rocketknight1 + +# Graph models +/src/transformers/models/graphormer/mod*_graphormer* @clefourrier + +# Finally, files with no owners that shouldn't generate pings, usually automatically generated and checked in the CI +utils/dummy* \ No newline at end of file diff --git a/README.md b/README.md index 42403f84b885da..8ab5ceaf7e687b 
100644 --- a/README.md +++ b/README.md @@ -255,17 +255,39 @@ You should install 🤗 Transformers in a [virtual environment](https://docs.pyt First, create a virtual environment with the version of Python you're going to use and activate it. -Then, you will need to install at least one of Flax, PyTorch, or TensorFlow. -Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/), [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) and/or [Flax](https://github.com/google/flax#quick-install) and [Jax](https://github.com/google/jax#installation) installation pages regarding the specific installation command for your platform. +**macOS/Linux** + +```bash +python -m venv env +source env/bin/activate +``` + +**Windows** + +``` +python -m venv env +env\Scripts\activate +``` + +To use 🤗 Transformers, you must install at least one of Flax, PyTorch, or TensorFlow. Refer to the official installation guides for platform-specific commands: + +[TensorFlow installation page](https://www.tensorflow.org/install/), +[PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) and/or [Flax](https://github.com/google/flax#quick-install) and [Jax](https://github.com/google/jax#installation) When one of those backends has been installed, 🤗 Transformers can be installed using pip as follows: -```bash +``` pip install transformers ``` If you'd like to play with the examples or need the bleeding edge of the code and can't wait for a new release, you must [install the library from source](https://huggingface.co/docs/transformers/installation#installing-from-source). +```bash +git clone https://github.com/huggingface/transformers.git +cd transformers +pip install . +``` + ### With conda 🤗 Transformers can be installed using conda as follows: diff --git a/docker/transformers-all-latest-gpu/Dockerfile b/docker/transformers-all-latest-gpu/Dockerfile index b597f5a73fb5be..b40ba3f35ff874 100644 --- a/docker/transformers-all-latest-gpu/Dockerfile +++ b/docker/transformers-all-latest-gpu/Dockerfile @@ -65,6 +65,9 @@ RUN python3 -m pip install --no-cache-dir python-Levenshtein # For `FastSpeech2ConformerTokenizer` tokenizer RUN python3 -m pip install --no-cache-dir g2p-en +# For some bitsandbytes tests +RUN python3 -m pip install --no-cache-dir einops + # When installing in editable mode, `transformers` is not recognized as a package. # this line must be added in order for python to be aware of transformers.
RUN cd transformers && python3 setup.py develop diff --git a/docs/source/ar/_toctree.yml b/docs/source/ar/_toctree.yml index c306b0ada80691..01568e93561de4 100644 --- a/docs/source/ar/_toctree.yml +++ b/docs/source/ar/_toctree.yml @@ -35,20 +35,20 @@ sections: # - local: tasks/sequence_classification # title: تصنيف النصوص -# - local: tasks/token_classification -# title: تصنيف الرموز + - local: tasks/token_classification + title: تصنيف الرموز - local: tasks/question_answering title: الإجابة على الأسئلة # - local: tasks/language_modeling # title: نمذجة اللغة السببية # - local: tasks/masked_language_modeling # title: نمذجة اللغة المقنعة -# - local: tasks/translation -# title: الترجمة + - local: tasks/translation + title: الترجمة - local: tasks/summarization title: التلخيص -# - local: tasks/multiple_choice -# title: الاختيار المتعدد + - local: tasks/multiple_choice + title: الاختيار المتعدد title: معالجة اللغات الطبيعية # - isExpanded: false # sections: diff --git a/docs/source/ar/tasks/multiple_choice.md b/docs/source/ar/tasks/multiple_choice.md new file mode 100644 index 00000000000000..78f98560754f11 --- /dev/null +++ b/docs/source/ar/tasks/multiple_choice.md @@ -0,0 +1,452 @@ + + +# الاختيار من متعدد (Multiple choice) + +[[open-in-colab]] + +مهمة الاختيار من متعدد مشابهة لمهمة الإجابة على الأسئلة، ولكن مع توفير عدة إجابات محتملة مع سياق، ويُدرّب النموذج على تحديد الإجابة الصحيحة. + +سيوضح لك هذا الدليل كيفية: + +1. ضبط نموذج [BERT](https://huggingface.co/google-bert/bert-base-uncased) باستخدام الإعداد `regular` لمجموعة بيانات [SWAG](https://huggingface.co/datasets/swag) لاختيار الإجابة الأفضل من بين الخيارات المتعددة المتاحة مع السياق. +2. استخدام النموذج المضبوط للاستدلال. + +قبل البدء، تأكد من تثبيت جميع المكتبات الضرورية: + +```bash +pip install transformers datasets evaluate +``` + +نشجعك على تسجيل الدخول إلى حساب Hugging Face الخاص بك حتى تتمكن من تحميل نموذجك ومشاركته مع المجتمع. عند المطالبة، أدخل الرمز المميز الخاص بك لتسجيل الدخول: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## تحميل مجموعة بيانات SWAG + +ابدأ بتحميل تهيئة `regular` لمجموعة بيانات SWAG من مكتبة 🤗 Datasets: + +```py +>>> from datasets import load_dataset + +>>> swag = load_dataset("swag", "regular") +``` + +ثم ألق نظرة على مثال: + +```py +>>> swag["train"][0] +{'ending0': 'passes by walking down the street playing their instruments.', + 'ending1': 'has heard approaching them.', + 'ending2': "arrives and they're outside dancing and asleep.", + 'ending3': 'turns the lead singer watches the performance.', + 'fold-ind': '3416', + 'gold-source': 'gold', + 'label': 0, + 'sent1': 'Members of the procession walk down the street holding small horn brass instruments.', + 'sent2': 'A drum line', + 'startphrase': 'Members of the procession walk down the street holding small horn brass instruments. A drum line', + 'video-id': 'anetv_jkn6uvmqwh4'} +``` + +على الرغم من أن الحقول تبدو كثيرة، إلا أنها في الواقع بسيطة جداً: + +- `sent1` و `sent2`: يعرض هذان الحقلان بداية الجملة، وبدمجهما معًا، نحصل على حقل `startphrase`. +- `ending`: يقترح نهاية محتملة للجملة، واحدة منها فقط هي الصحيحة. +- `label`: يحدد نهاية الجملة الصحيحة. + +## المعالجة المسبقة (Preprocess) + +الخطوة التالية هي استدعاء مُجزئ BERT لمعالجة بدايات الجمل والنهايات الأربع المحتملة: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") +``` + +تحتاج دالة المعالجة المسبقة التي تريد إنشاءها إلى: + +1. 
إنشاء أربع نسخ من حقل `sent1` ودمج كل منها مع `sent2` لإعادة إنشاء كيفية بدء الجملة. +2. دمج `sent2` مع كل من نهايات الجمل الأربع المحتملة. +3. تتجميع هاتين القائمتين لتتمكن من تجزئتهما، ثم إعادة ترتيبها بعد ذلك بحيث يكون لكل مثال حقول `input_ids` و `attention_mask` و `labels` مقابلة. + + +```py +>>> ending_names = ["ending0", "ending1", "ending2", "ending3"] + +>>> def preprocess_function(examples): +... first_sentences = [[context] * 4 for context in examples["sent1"]] +... question_headers = examples["sent2"] +... second_sentences = [ +... [f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers) +... ] + +... first_sentences = sum(first_sentences, []) +... second_sentences = sum(second_sentences, []) + +... tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True) +... return {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()} +``` + +لتطبيق دالة المعالجة المسبقة على مجموعة البيانات بأكملها، استخدم طريقة [`~datasets.Dataset.map`] الخاصة بـ 🤗 Datasets. يمكنك تسريع دالة `map` عن طريق تعيين `batched=True` لمعالجة عناصر متعددة من مجموعة البيانات في وقت واحد: + +```py +tokenized_swag = swag.map(preprocess_function, batched=True) +``` + +لا يحتوي 🤗 Transformers على مجمع بيانات للاختيار من متعدد، لذلك ستحتاج إلى تكييف [`DataCollatorWithPadding`] لإنشاء دفعة من الأمثلة. من الأكفأ إضافة حشو (padding) ديناميكي للجمل إلى أطول طول في دفعة أثناء التجميع، بدلاً من حشو مجموعة البيانات بأكملها إلى الحد الأقصى للطول. + +يقوم `DataCollatorForMultipleChoice` بتجميع جميع مدخلات النموذج، ويطبق الحشو، ثم يعيد تجميع النتائج في شكلها الأصلي: + + + + +```py +>>> from dataclasses import dataclass +>>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy +>>> from typing import Optional, Union +>>> import torch + +>>> @dataclass +... class DataCollatorForMultipleChoice: +... """ +... Data collator that will dynamically pad the inputs for multiple choice received. +... """ + +... tokenizer: PreTrainedTokenizerBase +... padding: Union[bool, str, PaddingStrategy] = True +... max_length: Optional[int] = None +... pad_to_multiple_of: Optional[int] = None + +... def __call__(self, features): +... label_name = "label" if "label" in features[0].keys() else "labels" +... labels = [feature.pop(label_name) for feature in features] +... batch_size = len(features) +... num_choices = len(features[0]["input_ids"]) +... flattened_features = [ +... [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features +... ] +... flattened_features = sum(flattened_features, []) + +... batch = self.tokenizer.pad( +... flattened_features, +... padding=self.padding, +... max_length=self.max_length, +... pad_to_multiple_of=self.pad_to_multiple_of, +... return_tensors="pt", +... ) + +... batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()} +... batch["labels"] = torch.tensor(labels, dtype=torch.int64) +... return batch +``` + + + +```py +>>> from dataclasses import dataclass +>>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy +>>> from typing import Optional, Union +>>> import tensorflow as tf + +>>> @dataclass +... class DataCollatorForMultipleChoice: +... """ +... Data collator that will dynamically pad the inputs for multiple choice received. +... """ + +... tokenizer: PreTrainedTokenizerBase +... padding: Union[bool, str, PaddingStrategy] = True +... max_length: Optional[int] = None +... 
pad_to_multiple_of: Optional[int] = None + +... def __call__(self, features): +... label_name = "label" if "label" in features[0].keys() else "labels" +... labels = [feature.pop(label_name) for feature in features] +... batch_size = len(features) +... num_choices = len(features[0]["input_ids"]) +... flattened_features = [ +... [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features +... ] +... flattened_features = sum(flattened_features, []) + +... batch = self.tokenizer.pad( +... flattened_features, +... padding=self.padding, +... max_length=self.max_length, +... pad_to_multiple_of=self.pad_to_multiple_of, +... return_tensors="tf", +... ) + +... batch = {k: tf.reshape(v, (batch_size, num_choices, -1)) for k, v in batch.items()} +... batch["labels"] = tf.convert_to_tensor(labels, dtype=tf.int64) +... return batch +``` + + + +## التقييم (Evaluate) + +يُفضل غالبًا تضمين مقياس أثناء التدريب لتقييم أداء نموذجك. يمكنك تحميل طريقة تقييم بسرعة باستخدام مكتبة 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index). لهذه المهمة، قم بتحميل مقياس [الدقة](https://huggingface.co/spaces/evaluate-metric/accuracy) (انظر إلى [الجولة السريعة](https://huggingface.co/docs/evaluate/a_quick_tour) لـ 🤗 Evaluate لمعرفة المزيد حول كيفية تحميل المقياس وحسابه): + +```py +>>> import evaluate + +>>> accuracy = evaluate.load("accuracy") +``` + +ثم أنشئ دالة لتمرير التنبؤات والتسميات إلى [`~evaluate.EvaluationModule.compute`] لحساب الدقة: + +```py +>>> import numpy as np + +>>> def compute_metrics(eval_pred): +... predictions, labels = eval_pred +... predictions = np.argmax(predictions, axis=1) +... return accuracy.compute(predictions=predictions, references=labels) +``` + +دالتك `compute_metrics` جاهزة الآن، وستعود إليها عند إعداد تدريبك. + +## التدريب (Train) + + + + + + +إذا لم تكن معتادًا على ضبط نموذج باستخدام [`Trainer`], فراجع الدرس الأساسي [هنا](../training#train-with-pytorch-trainer)! + + + +أنت جاهز لبدء تدريب نموذجك الآن! قم بتحميل BERT باستخدام [`AutoModelForMultipleChoice`]: + +```py +>>> from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer + +>>> model = AutoModelForMultipleChoice.from_pretrained("google-bert/bert-base-uncased") +``` + +في هذه المرحلة، تبقى ثلاث خطوات فقط: + +1. حدد معلمات التدريب الخاصة بك في [`TrainingArguments`]. المعلمة الوحيدة المطلوبة هي `output_dir` التي تحدد مكان حفظ نموذجك. ستدفع هذا النموذج إلى Hub عن طريق تعيين `push_to_hub=True` (يجب عليك تسجيل الدخول إلى Hugging Face لتحميل نموذجك). في نهاية كل حقبة، سيقوم [`Trainer`] بتقييم الدقة وحفظ نقطة فحص التدريب. +2. مرر معلمات التدريب إلى [`Trainer`] جنبًا إلى جنب مع النموذج ومُجمِّع البيانات والمعالج ودالة تجميع البيانات ودالة `compute_metrics`. +3. استدعي [`~Trainer.train`] لضبط نموذجك. + +```py +>>> training_args = TrainingArguments( +... output_dir="my_awesome_swag_model", +... eval_strategy="epoch", +... save_strategy="epoch", +... load_best_model_at_end=True, +... learning_rate=5e-5, +... per_device_train_batch_size=16, +... per_device_eval_batch_size=16, +... num_train_epochs=3, +... weight_decay=0.01, +... push_to_hub=True, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=tokenized_swag["train"], +... eval_dataset=tokenized_swag["validation"], +... processing_class=tokenizer, +... data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer), +... compute_metrics=compute_metrics, +... 
) + +>>> trainer.train() +``` + +بمجرد اكتمال التدريب، شارك نموذجك مع Hub باستخدام طريقة [`~transformers.Trainer.push_to_hub`] حتى يتمكن الجميع من استخدام نموذجك: + +```py +>>> trainer.push_to_hub() +``` + + + + +إذا لم تكن معتادًا على ضبط نموذج باستخدام Keras، فراجع الدرس الأساسي [هنا](../training#train-a-tensorflow-model-with-keras)! + + +لضبط نموذج في TensorFlow، ابدأ بإعداد دالة مُحسِّن وجدول معدل التعلم وبعض معلمات التدريب: + +```py +>>> from transformers import create_optimizer + +>>> batch_size = 16 +>>> num_train_epochs = 2 +>>> total_train_steps = (len(tokenized_swag["train"]) // batch_size) * num_train_epochs +>>> optimizer, schedule = create_optimizer(init_lr=5e-5, num_warmup_steps=0, num_train_steps=total_train_steps) +``` + +ثم يمكنك تحميل BERT باستخدام [`TFAutoModelForMultipleChoice`]: + +```py +>>> from transformers import TFAutoModelForMultipleChoice + +>>> model = TFAutoModelForMultipleChoice.from_pretrained("google-bert/bert-base-uncased") +``` + +حوّل مجموعات البيانات الخاصة بك إلى تنسيق `tf.data.Dataset` باستخدام [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: + +```py +>>> data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer) +>>> tf_train_set = model.prepare_tf_dataset( +... tokenized_swag["train"], +... shuffle=True, +... batch_size=batch_size, +... collate_fn=data_collator, +... ) + +>>> tf_validation_set = model.prepare_tf_dataset( +... tokenized_swag["validation"], +... shuffle=False, +... batch_size=batch_size, +... collate_fn=data_collator, +... ) +``` + +قم بتهيئة النموذج للتدريب باستخدام [`compile`](https://keras.io/api/models/model_training_apis/#compile-method). لاحظ أن جميع نماذج Transformers تحتوي على دالة خسارة مناسبة للمهمة بشكل افتراضي، لذلك لا تحتاج إلى تحديد واحدة ما لم ترغب في ذلك: + +```py +>>> model.compile(optimizer=optimizer) # لا توجد وسيطة خسارة! +``` + +الخطوتان الأخيرتان قبل بدء التدريب هما: حساب دقة التنبؤات، وتوفير طريقة لرفع النموذج إلى Hub. ويمكن تحقيق ذلك باستخدام [استدعاءات Keras](../main_classes/keras_callbacks) + +مرر دالتك `compute_metrics` إلى [`~transformers.KerasMetricCallback`]: + +```py +>>> from transformers.keras_callbacks import KerasMetricCallback + +>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set) +``` + +حدد مكان دفع نموذجك ومعالجك في [`~transformers.PushToHubCallback`]: + +```py +>>> from transformers.keras_callbacks import PushToHubCallback + +>>> push_to_hub_callback = PushToHubCallback( +... output_dir="my_awesome_model", +... tokenizer=tokenizer, +... ) +``` + +ثم قم بتضمين الاستدعاءات معًا: + +```py +>>> callbacks = [metric_callback, push_to_hub_callback] +``` + +أخيرًا، أنت جاهز لبدء تدريب نموذجك! استدعِ[`fit`](https://keras.io/api/models/model_training_apis/#fit-method) مع مجموعات بيانات التدريب والتحقق من الصحة وعدد الحقب والاستدعاءات لضبط النموذج: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=2, callbacks=callbacks) +``` + +بمجرد اكتمال التدريب، يتم تحميل نموذجك تلقائيًا إلى Hub حتى يتمكن الجميع من استخدامه! + + + + + +للحصول على مثال أكثر تعمقًا حول كيفية ضبط نموذج للاختيار من متعدد، ألق نظرة على [دفتر ملاحظات PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb) +أو [دفتر ملاحظات TensorFlow](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice-tf.ipynb) المقابل. + + + +## الاستدلال (Inference) + +رائع، الآن بعد أن قمت بضبط نموذج، يمكنك استخدامه للاستدلال! 
+ +قم بإنشاء نص واقتراح إجابتين محتملتين: + +```py +>>> prompt = "France has a bread law, Le Décret Pain, with strict rules on what is allowed in a traditional baguette." +>>> candidate1 = "The law does not apply to croissants and brioche." +>>> candidate2 = "The law applies to baguettes." +``` + + + +قم بتحليل كل مطالبة وزوج إجابة مرشح وأعد تنسورات PyTorch. يجب عليك أيضًا إنشاء بعض `العلامات`: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_swag_model") +>>> inputs = tokenizer([[prompt, candidate1], [prompt, candidate2]], return_tensors="pt", padding=True) +>>> labels = torch.tensor(0).unsqueeze(0) +``` + +مرر مدخلاتك والعلامات إلى النموذج وأرجع`logits`: + +```py +>>> from transformers import AutoModelForMultipleChoice + +>>> model = AutoModelForMultipleChoice.from_pretrained("username/my_awesome_swag_model") +>>> outputs = model(**{k: v.unsqueeze(0) for k, v in inputs.items()}, labels=labels) +>>> logits = outputs.logits +``` + +استخرج الفئة ذات الاحتمالية الأكبر: + +```py +>>> predicted_class = logits.argmax().item() +>>> predicted_class +0 +``` + + +قم بتحليل كل مطالبة وزوج إجابة مرشح وأعد موترات TensorFlow: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_swag_model") +>>> inputs = tokenizer([[prompt, candidate1], [prompt, candidate2]], return_tensors="tf", padding=True) +``` + +مرر مدخلاتك إلى النموذج وأعد القيم logits: + +```py +>>> from transformers import TFAutoModelForMultipleChoice + +>>> model = TFAutoModelForMultipleChoice.from_pretrained("username/my_awesome_swag_model") +>>> inputs = {k: tf.expand_dims(v, 0) for k, v in inputs.items()} +>>> outputs = model(inputs) +>>> logits = outputs.logits +``` + +استخرج الفئة ذات الاحتمالية الأكبر: + +```py +>>> predicted_class = int(tf.math.argmax(logits, axis=-1)[0]) +>>> predicted_class +0 +``` + + diff --git a/docs/source/ar/tasks/token_classification.md b/docs/source/ar/tasks/token_classification.md new file mode 100644 index 00000000000000..e311482aeccb06 --- /dev/null +++ b/docs/source/ar/tasks/token_classification.md @@ -0,0 +1,550 @@ + + +# تصنيف الرموز(Token classification) + +[[open-in-colab]] + + + +يهدف تصنيف الرموز إلى إعطاء تسمية لكل رمز على حدة في الجملة. من أكثر مهام تصنيف الرموز شيوعًا هو التعرف على الكيانات المسماة (NER). يحاول NER تحديد تسمية لكل كيان في الجملة، مثل شخص، أو مكان، أو منظمة. + +سيوضح لك هذا الدليل كيفية: + +1. ضبط [DistilBERT](https://huggingface.co/distilbert/distilbert-base-uncased) على مجموعة بيانات [WNUT 17](https://huggingface.co/datasets/wnut_17) للكشف عن كيانات جديدة. +2. استخدام نموذجك المضبوط بدقة للاستدلال. + + + +للاطلاع جميع البنى والنقاط المتوافقة مع هذه المهمة، نوصي بالرجوع من [صفحة المهمة](https://huggingface.co/tasks/token-classification). + + + +قبل أن تبدأ، تأكد من تثبيت جميع المكتبات الضرورية: + +```bash +pip install transformers datasets evaluate seqeval +``` + +نحن نشجعك على تسجيل الدخول إلى حساب HuggingFace الخاص بك حتى تتمكن من تحميل ومشاركة نموذجك مع المجتمع. 
عندما يُطلب منك، أدخل رمزك لتسجيل الدخول: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## تحميل مجموعة بيانات WNUT 17 + +ابدأ بتحميل مجموعة بيانات WNUT 17 من مكتبة 🤗 Datasets: + +```py +>>> from datasets import load_dataset + +>>> wnut = load_dataset("wnut_17") +``` + +ثم ألق نظرة على مثال: + +```py +>>> wnut["train"][0] +{'id': '0', + 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 8, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0], + 'tokens': ['@paulwalk', 'It', "'s", 'the', 'view', 'from', 'where', 'I', "'m", 'living', 'for', 'two', 'weeks', '.', 'Empire', 'State', 'Building', '=', 'ESB', '.', 'Pretty', 'bad', 'storm', 'here', 'last', 'evening', '.'] +} +``` + +يمثل كل رقم في `ner_tags` كياناً. حوّل الأرقام إلى أسماء التصنيفات لمعرفة ماهية الكيانات: + +```py +>>> label_list = wnut["train"].features[f"ner_tags"].feature.names +>>> label_list +[ + "O", + "B-corporation", + "I-corporation", + "B-creative-work", + "I-creative-work", + "B-group", + "I-group", + "B-location", + "I-location", + "B-person", + "I-person", + "B-product", + "I-product", +] +``` + +يشير الحرف الذي يسبق كل `ner_tag` إلى موضع الرمز للكيان: + +- `B-` يشير إلى بداية الكيان. +- `I-` يشير إلى أن الرمز يقع ضمن نفس الكيان (على سبيل المثال، الرمز `State` هو جزء من كيان مثل `Empire State Building`). +- `0` يشير إلى أن الرمز لا يمثل أي كيان. + +## المعالجة المسبقة(Preprocess) + + + +الخطوة التالية هي تحميل مُجزِّئ النصوص DistilBERT للمعالجة المسبقة لحقل `tokens`: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased") +``` + +كما رأيت في حقل `tokens` المثال أعلاه، يبدو أن المدخل قد تم تحليله بالفعل. لكن المدخل لم يُجزأ بعد ويتعيّن عليك ضبط `is_split_into_words=True` لتقسيم الكلمات إلى كلمات فرعية. على سبيل المثال: + +```py +>>> example = wnut["train"][0] +>>> tokenized_input = tokenizer(example["tokens"], is_split_into_words=True) +>>> tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"]) +>>> tokens +['[CLS]', '@', 'paul', '##walk', 'it', "'", 's', 'the', 'view', 'from', 'where', 'i', "'", 'm', 'living', 'for', 'two', 'weeks', '.', 'empire', 'state', 'building', '=', 'es', '##b', '.', 'pretty', 'bad', 'storm', 'here', 'last', 'evening', '.', '[SEP]'] +``` + +ومع ذلك، يضيف هذا بعض الرموز الخاصة `[CLS]` و`[SEP]` وتقسيم الكلمات إلى أجزاء يُنشئ عدم تطابق بين المُدخلات والتسميات. قد يتم تقسيم كلمة واحدة تقابل تسمية واحدة الآن إلى كلمتين فرعيتين. ستحتاج إلى إعادة محاذاة الرموز والتسميات عن طريق: + +1. ربط كل رمز بالكلمة الأصلية باستخدام الخاصية [`word_ids`](https://huggingface.co/docs/transformers/main_classes/tokenizer#transformers.BatchEncoding.word_ids). +2. تعيين التسمية `-100` للرموز الخاصة `[CLS]` و`[SEP]` بحيث يتم تجاهلها بواسطة دالة الخسارة PyTorch (انظر [CrossEntropyLoss](https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html)). +3. تسمية الرمز الأول فقط لكلمة معينة. قم بتعيين `-100` لأجزاء الكلمة الأخرى. + +هنا كيف يمكنك إنشاء وظيفة لإعادة محاذاة الرموز والتسميات، وقص الجمل لتتجاوز الحد الأقصى لطول مُدخلات DistilBERT: + +```py +>>> def tokenize_and_align_labels(examples): +... tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True) + +... labels = [] +... for i, label in enumerate(examples[f"ner_tags"]): +... word_ids = tokenized_inputs.word_ids(batch_index=i) # تعيين الرموز إلى كلماتهم المقابلة. +... previous_word_idx = None +... label_ids = [] +... for word_idx in word_ids: # تعيين الرموز الخاصة إلى -100. +... 
if word_idx is None: +... label_ids.append(-100) +... elif word_idx != previous_word_idx: # تسمية الرمز الأول فقط لكلمة معينة. +... label_ids.append(label[word_idx]) +... else: +... label_ids.append(-100) +... previous_word_idx = word_idx +... labels.append(label_ids) + +... tokenized_inputs["labels"] = labels +... return tokenized_inputs +``` + +لتطبيق هذه العملية على كامل مجموعة البيانات، استخدم الدالة [`~datasets.Dataset.map`] لمجموعة بيانات 🤗. يمكنك تسريع الدالة `map` عن طريق تعيين `batched=True` لمعالجة عناصر متعددة من مجموعة البيانات في وقت واحد: + +```py +>>> tokenized_wnut = wnut.map(tokenize_and_align_labels, batched=True) +``` + +الآن قم بإنشاء دفعة من الأمثلة باستخدام [`DataCollatorWithPadding`].من الأفضل استخدام *الحشو الديناميكي* للجمل إلى أطول طول في دفعة أثناء التجميع، بدلاً من حشو مجموعة البيانات بالكامل إلى الطول الأقصى. + + + +```py +>>> from transformers import DataCollatorForTokenClassification + +>>> data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer) +``` + + +```py +>>> from transformers import DataCollatorForTokenClassification + +>>> data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf") +``` + + + +## التقييم(Evaluate) + +يُعدّ تضمين مقياس أثناء التدريب مفيدًا في تقييم أداء نموذجك. يمكنك تحميل طريقة تقييم بسرعة مع مكتبة 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index). لهذه المهمة، قم بتحميل إطار [seqeval](https://huggingface.co/spaces/evaluate-metric/seqeval) (انظر جولة 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) لمعرفة المزيد حول كيفية تحميل وحساب مقياس). يُخرج seqeval عدة نتائج: الدقة، والاستذكار، ومقياس F1، والدقة. + +```py +>>> import evaluate + +>>> seqeval = evaluate.load("seqeval") +``` + +احصل على تسميات الكيانات المسماة (NER) أولاً،ثم أنشئ دالة تُمرر تنبؤاتك وتسمياتك الصحيحة إلى [`~evaluate.EvaluationModule.compute`] لحساب النتائج: + +```py +>>> import numpy as np + +>>> labels = [label_list[i] for i in example[f"ner_tags"]] + +>>> def compute_metrics(p): +... predictions, labels = p +... predictions = np.argmax(predictions, axis=2) + +... true_predictions = [ +... [label_list[p] for (p, l) in zip(prediction, label) if l != -100] +... for prediction, label in zip(predictions, labels) +... ] +... true_labels = [ +... [label_list[l] for (p, l) in zip(prediction, label) if l != -100] +... for prediction, label in zip(predictions, labels) +... ] + +... results = seqeval.compute(predictions=true_predictions, references=true_labels) +... return { +... "precision": results["overall_precision"], +... "recall": results["overall_recall"], +... "f1": results["overall_f1"], +... "accuracy": results["overall_accuracy"], +... } +``` + +دالة `compute_metrics` جاهزة للاستخدام، وستحتاج إليها عند إعداد التدريب. + +## التدريب(Train) + +قبل تدريب النموذج، جهّز خريطة تربط بين المعرّفات المتوقعة وتسمياتها باستخدام `id2label` و `label2id`: + +```py +>>> id2label = { +... 0: "O", +... 1: "B-corporation", +... 2: "I-corporation", +... 3: "B-creative-work", +... 4: "I-creative-work", +... 5: "B-group", +... 6: "I-group", +... 7: "B-location", +... 8: "I-location", +... 9: "B-person", +... 10: "I-person", +... 11: "B-product", +... 12: "I-product", +... } +>>> label2id = { +... "O": 0, +... "B-corporation": 1, +... "I-corporation": 2, +... "B-creative-work": 3, +... "I-creative-work": 4, +... "B-group": 5, +... "I-group": 6, +... "B-location": 7, +... "I-location": 8, +... "B-person": 9, +... "I-person": 10, +... "B-product": 11, +... "I-product": 12, +... 
} +``` + + + + + +إذا لم تكن على دراية بتعديل نموذج باستخدام [`Trainer`], ألق نظرة على الدليل التعليمي الأساسي [هنا](../training#train-with-pytorch-trainer)! + + + +أنت مستعد الآن لبدء تدريب نموذجك! قم بتحميل DistilBERT مع [`AutoModelForTokenClassification`] إلى جانب عدد التصنيفات المتوقعة، وخريطة التسميات: + +```py +>>> from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer + +>>> model = AutoModelForTokenClassification.from_pretrained( +... "distilbert/distilbert-base-uncased", num_labels=13, id2label=id2label, label2id=label2id +... ) +``` + +في هذه المرحلة، هناك ثلاث خطوات فقط متبقية: + +1. حدد معلمات التدريب الخاصة بك في [`TrainingArguments`]. المعامل الوحيد المطلوب هو `output_dir` الذي يحدد مكان حفظ نموذجك. ستقوم بدفع هذا النموذج إلى Hub عن طريق تعيين `push_to_hub=True` (يجب أن تكون مسجلاً الدخول إلى Hugging Face لتحميل نموذجك). في نهاية كل حقبة، سيقوم [`Trainer`] بتقييم درجات seqeval وحفظ تسخة التدريب. +2. قم بتمرير معاملات التدريب إلى [`Trainer`] إلى جانب النموذج، ومجموعة البيانات، والمُجزِّئ اللغوي، و`data collator`، ودالة `compute_metrics`. +3.استدعِ [`~Trainer.train`] لتدريب نموذجك. + +```py +>>> training_args = TrainingArguments( +... output_dir="my_awesome_wnut_model", +... learning_rate=2e-5, +... per_device_train_batch_size=16, +... per_device_eval_batch_size=16, +... num_train_epochs=2, +... weight_decay=0.01, +... eval_strategy="epoch", +... save_strategy="epoch", +... load_best_model_at_end=True, +... push_to_hub=True, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=tokenized_wnut["train"], +... eval_dataset=tokenized_wnut["test"], +... processing_class=tokenizer, +... data_collator=data_collator, +... compute_metrics=compute_metrics, +... ) + +>>> trainer.train() +``` + +بمجرد اكتمال التدريب، شارك نموذجك على Hub باستخدام طريقة [`~transformers.Trainer.push_to_hub`] حتى يتمكن الجميع من استخدام نموذجك: + +```py +>>> trainer.push_to_hub() +``` + + + + +إذا لم تكن على دراية بتعديل نموذج باستخدام Keras، ألق نظرة على الدليل التعليمي الأساسي [هنا](../training#train-a-tensorflow-model-with-keras)! + + +للتعديل على نموذج في TensorFlow، ابدأ بإعداد دالة محسن، وجدول معدل التعلم، وبعض معلمات التدريب: + +```py +>>> from transformers import create_optimizer + +>>> batch_size = 16 +>>> num_train_epochs = 3 +>>> num_train_steps = (len(tokenized_wnut["train"]) // batch_size) * num_train_epochs +>>> optimizer, lr_schedule = create_optimizer( +... init_lr=2e-5, +... num_train_steps=num_train_steps, +... weight_decay_rate=0.01, +... num_warmup_steps=0, +... ) +``` + +ثم يمكنك تحميل DistilBERT مع [`TFAutoModelForTokenClassification`] إلى جانب عدد التسميات المتوقعة، وتخطيطات التسميات: + +```py +>>> from transformers import TFAutoModelForTokenClassification + +>>> model = TFAutoModelForTokenClassification.from_pretrained( +... "distilbert/distilbert-base-uncased", num_labels=13, id2label=id2label, label2id=label2id +... ) +``` + +قم بتحويل مجموعات بياناتك إلى تنسيق `tf.data.Dataset` مع [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: + +```py +>>> tf_train_set = model.prepare_tf_dataset( +... tokenized_wnut["train"], +... shuffle=True, +... batch_size=16, +... collate_fn=data_collator, +... ) + +>>> tf_validation_set = model.prepare_tf_dataset( +... tokenized_wnut["validation"], +... shuffle=False, +... batch_size=16, +... collate_fn=data_collator, +... ) +``` + +هيّئ النموذج للتدريب باستخدام [`compile`](https://keras.io/api/models/model_training_apis/#compile-method). 
لاحظ أن نماذج Transformers تتضمن دالة خسارة افتراضية مرتبطة بالمهمة، لذلك لا تحتاج إلى تحديد واحدة إلا إذا كنت ترغب في ذلك: + +```py +>>> import tensorflow as tf + +>>> model.compile(optimizer=optimizer) # No loss argument! +``` + +آخر أمرين يجب إعدادهما قبل بدء التدريب هو حساب درجات seqeval من التنبؤات، وتوفير طريقة لدفع نموذجك إلى Hub. يتم ذلك باستخدام [Keras callbacks](../main_classes/keras_callbacks). + +مرر دالة `compute_metrics` الخاصة بك إلى [`~transformers.KerasMetricCallback`]: + +```py +>>> from transformers.keras_callbacks import KerasMetricCallback + +>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set) +``` + +حدد مكان دفع نموذجك والمحلل اللغوي في [`~transformers.PushToHubCallback`]: + +```py +>>> from transformers.keras_callbacks import PushToHubCallback + +>>> push_to_hub_callback = PushToHubCallback( +... output_dir="my_awesome_wnut_model", +... tokenizer=tokenizer, +... ) +``` + +ثم جمّع callbacks الخاصة بك معًا: + +```py +>>> callbacks = [metric_callback, push_to_hub_callback] +``` + +أخيرًا، أنت جاهز الآن لبدء تدريب نموذجك! قم باستدعاء [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) مع بيانات التدريب والتحقق، وعدد الحقبات، وcallbacks لتعديل النموذج: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=callbacks) +``` + +بمجرد اكتمال التدريب، يتم تحميل نموذجك تلقائيًا إلى Hub حتى يتمكن الجميع من استخدامه! + + + + + +للحصول على مثال أكثر تفصيلاً حول كيفية تعديل نموذج لتصنيف الرموز، ألق نظرة على الدفتر المقابل +[دفتر PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb) +أو [دفتر TensorFlow](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb). + + + +## الاستدلال(Inference) + +رائع، الآن بعد أن قمت بتعديل نموذج، يمكنك استخدامه للاستدلال! + +احصل على بعض النصوص التي تريد تشغيل الاستدلال عليها: + +```py +>>> text = "The Golden State Warriors are an American professional basketball team based in San Francisco." +``` + +أبسط طريقة لتجربة نموذجك المُدرب مسبقًا للاستدلال هي استخدامه في [`pipeline`]. قم بتنفيذ `pipeline` لتصنيف الكيانات المسماة مع نموذجك، ومرر نصك إليه: + +```py +>>> from transformers import pipeline + +>>> classifier = pipeline("ner", model="stevhliu/my_awesome_wnut_model") +>>> classifier(text) +[{'entity': 'B-location', + 'score': 0.42658573, + 'index': 2, + 'word': 'golden', + 'start': 4, + 'end': 10}, + {'entity': 'I-location', + 'score': 0.35856336, + 'index': 3, + 'word': 'state', + 'start': 11, + 'end': 16}, + {'entity': 'B-group', + 'score': 0.3064001, + 'index': 4, + 'word': 'warriors', + 'start': 17, + 'end': 25}, + {'entity': 'B-location', + 'score': 0.65523505, + 'index': 13, + 'word': 'san', + 'start': 80, + 'end': 83}, + {'entity': 'B-location', + 'score': 0.4668663, + 'index': 14, + 'word': 'francisco', + 'start': 84, + 'end': 93}] +``` + +يمكنك أيضًا تكرار نتائج `pipeline` يدويًا إذا أردت: + + + +قسّم النص إلى رموز وأرجع المُوتّرات بلغة PyTorch: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_wnut_model") +>>> inputs = tokenizer(text, return_tensors="pt") +``` + +مرر مدخلاتك إلى النموذج واحصل على `logits`: + +```py +>>> from transformers import AutoModelForTokenClassification + +>>> model = AutoModelForTokenClassification.from_pretrained("stevhliu/my_awesome_wnut_model") +>>> with torch.no_grad(): +... 
logits = model(**inputs).logits +``` + +استخرج الفئة ذات الاحتمالية الأعلى، واستخدم جدول `id2label` الخاصة بالنموذج لتحويلها إلى تسمية نصية: + +```py +>>> predictions = torch.argmax(logits, dim=2) +>>> predicted_token_class = [model.config.id2label[t.item()] for t in predictions[0]] +>>> predicted_token_class +['O', + 'O', + 'B-location', + 'I-location', + 'B-group', + 'O', + 'O', + 'O', + 'O', + 'O', + 'O', + 'O', + 'O', + 'B-location', + 'B-location', + 'O', + 'O'] +``` + + +قسّم النص إلى رموز وأرجع المُوتّرات ب TensorFlow: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_wnut_model") +>>> inputs = tokenizer(text, return_tensors="tf") +``` + +مرر مدخلاتك إلى النموذج واحصل على `logits`: + +```py +>>> from transformers import TFAutoModelForTokenClassification + +>>> model = TFAutoModelForTokenClassification.from_pretrained("stevhliu/my_awesome_wnut_model") +>>> logits = model(**inputs).logits +``` + +استخرج الفئة ذات الاحتمالية الأعلى، واستخدم جدول `id2label` الخاصة بالنموذج لتحويلها إلى تسمية نصية: + +```py +>>> predicted_token_class_ids = tf.math.argmax(logits, axis=-1) +>>> predicted_token_class = [model.config.id2label[t] for t in predicted_token_class_ids[0].numpy().tolist()] +>>> predicted_token_class +['O', + 'O', + 'B-location', + 'I-location', + 'B-group', + 'O', + 'O', + 'O', + 'O', + 'O', + 'O', + 'O', + 'O', + 'B-location', + 'B-location', + 'O', + 'O'] +``` + + diff --git a/docs/source/ar/tasks/translation.md b/docs/source/ar/tasks/translation.md new file mode 100644 index 00000000000000..6245b903c22d63 --- /dev/null +++ b/docs/source/ar/tasks/translation.md @@ -0,0 +1,407 @@ + + +# الترجمة(Translation) + +[[open-in-colab]] + + + +الترجمة هي عملية تحويل سلسلة نصية من لغة إلى أخرى. وهي إحدى المهام التي يمكن صياغتها كمسألة تسلسل إلى تسلسل، وهو إطار عمل قوي لإنتاج مخرجات من مدخلات، مثل الترجمة أو التلخيص. تُستخدم أنظمة الترجمة عادةً للترجمة بين نصوص لغات مختلفة، ويمكن استخدامها أيضًا لترجمة الكلام أو لمهام تجمع بين النصوص والكلام، مثل تحويل النص إلى كلام أو تحويل الكلام إلى نص. + +سيوضح لك هذا الدليل كيفية: + +1. ضبط دقيق لنموذج [T5](https://huggingface.co/google-t5/t5-small) على المجموعة الفرعية الإنجليزية-الفرنسية من مجموعة بيانات [OPUS Books](https://huggingface.co/datasets/opus_books) لترجمة النص الإنجليزي إلى الفرنسية. +2. استخدام النموذج المضبوط بدقة للاستدلال. + + + +لمشاهدة جميع البنى والنسخ المتوافقة مع هذه المهمة، نوصي بالتحقق من [صفحة المهمة](https://huggingface.co/tasks/translation). + + + +قبل البدء، تأكد من تثبيت جميع المكتبات الضرورية: + +```bash +pip install transformers datasets evaluate sacrebleu +``` + +نشجعك على تسجيل الدخول إلى حساب Hugging Face الخاص بك حتى تتمكن من تحميل نموذجك ومشاركته مع المجتمع. 
عند الطلب، أدخل الرمز المميز الخاص بك لتسجيل الدخول: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## تحميل مجموعة بيانات OPUS Books + +ابدأ بتحميل المجموعة الفرعية الإنجليزية-الفرنسية من مجموعة بيانات [OPUS Books](https://huggingface.co/datasets/opus_books) من مكتبة 🤗 Datasets: + +```py +>>> from datasets import load_dataset + +>>> books = load_dataset("opus_books", "en-fr") +``` + +قسّم مجموعة البيانات إلى مجموعة تدريب ومجموعة اختبار باستخدام طريقة [`~datasets.Dataset.train_test_split`]: + +```py +>>> books = books["train"].train_test_split(test_size=0.2) +``` + +ثم ألقِ نظرة على مثال: + +```py +>>> books["train"][0] +{'id': '90560', + 'translation': {'en': 'But this lofty plateau measured only a few fathoms, and soon we reentered Our Element.', + 'fr': 'Mais ce plateau élevé ne mesurait que quelques toises, et bientôt nous fûmes rentrés dans notre élément.'}} +``` + +`translation`: ترجمة إنجليزية وفرنسية للنص. + +## المعالجة المسبقة(Preprocess) + + + +الخطوة التالية هي تحميل مُجزئ T5 لمعالجة أزواج اللغة الإنجليزية-الفرنسية: + +```py +>>> from transformers import AutoTokenizer + +>>> checkpoint = "google-t5/t5-small" +>>> tokenizer = AutoTokenizer.from_pretrained(checkpoint) +``` + +يجب أن تقوم دالة المعالجة المسبقة التي تُريد إنشاءها بما يلي: + +1. إضافة بادئة إلى المُدخل بمُوجه حتى يعرف T5 أن هذه مهمة ترجمة. تتطلب بعض النماذج القادرة على أداء مهام متعددة توجيهًا لمهام مُحددة. +2. تعيين اللغة الهدف (الفرنسية) في معامل `text_target` لضمان معالجة المُجزئ للنص بشكل صحيح. إذا لم تُعيّن `text_target`، فسيُعالج المُجزئ النص على أنه إنجليزي. +3. اقتطاع التسلسلات بحيث لا يزيد طولها عن الحد الأقصى الذي يحدده معامل `max_length`. + +```py +>>> source_lang = "en" +>>> target_lang = "fr" +>>> prefix = "translate English to French: " + +>>> def preprocess_function(examples): +... inputs = [prefix + example[source_lang] for example in examples["translation"]] +... targets = [example[target_lang] for example in examples["translation"]] +... model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True) +... return model_inputs +``` + +لتطبيق دالة المعالجة المسبقة على مجموعة البيانات بأكملها، استخدم طريقة [`~datasets.Dataset.map`] من 🤗 Datasets. يمكنك تسريع دالة `map` عن طريق تعيين `batched=True` لمعالجة عناصر متعددة من مجموعة البيانات في وقت واحد: + +```py +>>> tokenized_books = books.map(preprocess_function, batched=True) +``` + +الآن أنشئ دفعة من الأمثلة باستخدام [`DataCollatorForSeq2Seq`]. من الأكثر كفاءة *الحشو الديناميكي* للجمل إلى أطول طول في دفعة أثناء التجميع، بدلاً من حشو مجموعة البيانات بأكملها إلى الحد الأقصى للطول. + + + + +```py +>>> from transformers import DataCollatorForSeq2Seq + +>>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint) +``` + + + +```py +>>> from transformers import DataCollatorForSeq2Seq + +>>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint, return_tensors="tf") +``` + + + +## التقييم (Evaluate) + +غالباً ما يكون تضمين مقياس أثناء التدريب مفيداً لتقييم أداء نموذجك. يمكنك تحميل طريقة تقييم بسرعة باستخدام مكتبة 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index). 
لهذه المهمة، حمّل مقياس [SacreBLEU](https://huggingface.co/spaces/evaluate-metric/sacrebleu) (راجع [الجولة السريعة](https://huggingface.co/docs/evaluate/a_quick_tour) لـ 🤗 Evaluate لمعرفة المزيد حول كيفية تحميل وحساب مقياس): + +```py +>>> import evaluate + +>>> metric = evaluate.load("sacrebleu") +``` + +ثم أنشئ دالة تُمرر تنبؤاتك وتسمياتك إلى [`~evaluate.EvaluationModule.compute`] لحساب درجة SacreBLEU: + +```py +>>> import numpy as np + +>>> def postprocess_text(preds, labels): +... preds = [pred.strip() for pred in preds] +... labels = [[label.strip()] for label in labels] + +... return preds, labels + +>>> def compute_metrics(eval_preds): +... preds, labels = eval_preds +... if isinstance(preds, tuple): +... preds = preds[0] +... decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) + +... labels = np.where(labels != -100, labels, tokenizer.pad_token_id) +... decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) + +... decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels) + +... result = metric.compute(predictions=decoded_preds, references=decoded_labels) +... result = {"bleu": result["score"]} + +... prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds] +... result["gen_len"] = np.mean(prediction_lens) +... result = {k: round(v, 4) for k, v in result.items()} +... return result +``` + +دالة `compute_metrics` الخاصة بك جاهزة الآن، وسوف تعود إليها عند إعداد التدريب. + +## التدريب (Train) + + + + + + +إذا لم تكن معتادًا على ضبط دقيق نموذج باستخدام [`Trainer`], فألقِ نظرة على البرنامج التعليمي الأساسي [هنا](../training#train-with-pytorch-trainer)! + + + +أنت جاهز لبدء تدريب نموذجك الآن! حمّل T5 باستخدام [`AutoModelForSeq2SeqLM`]: + +```py +>>> from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer + +>>> model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint) +``` + +في هذه المرحلة، تبقى ثلاث خطوات فقط: + +1. حدد مُعاملات للتدريب في [`Seq2SeqTrainingArguments`]. المُعامل الوحيدة المطلوبة هي `output_dir` التي تحدد مكان حفظ النموذج الخاص بك. ستقوم بدفع هذا النموذج إلى Hub عن طريق تعيين `push_to_hub=True` (يجب عليك تسجيل الدخول إلى Hugging Face لتحميل نموذجك). في نهاية كل حقبة، سيقوم [`Trainer`] بتقييم مقياس SacreBLEU وحفظ نقطة تدقيق التدريب. +2. مرر مُعاملات التدريب إلى [`Seq2SeqTrainer`] جنبًا إلى جنب مع النموذج ومجموعة البيانات والمعالج اللغوي وجامع البيانات ووظيفة `compute_metrics`. +3. نفّذ [`~Trainer.train`] لضبط نموذجك. + +```py +>>> training_args = Seq2SeqTrainingArguments( +... output_dir="my_awesome_opus_books_model", +... eval_strategy="epoch", +... learning_rate=2e-5, +... per_device_train_batch_size=16, +... per_device_eval_batch_size=16, +... weight_decay=0.01, +... save_total_limit=3, +... num_train_epochs=2, +... predict_with_generate=True, +... fp16=True, #change to bf16=True for XPU +... push_to_hub=True, +... ) + +>>> trainer = Seq2SeqTrainer( +... model=model, +... args=training_args, +... train_dataset=tokenized_books["train"], +... eval_dataset=tokenized_books["test"], +... processing_class=tokenizer, +... data_collator=data_collator, +... compute_metrics=compute_metrics, +... ) + +>>> trainer.train() +``` + +بمجرد اكتمال التدريب، شارك نموذجك مع Hub باستخدام طريقة [`~transformers.Trainer.push_to_hub`] حتى يتمكن الجميع من استخدام نموذجك: + +```py +>>> trainer.push_to_hub() +``` + + + + +إذا لم تكن معتادًا على ضبط نموذج باستخدام Keras، فألق نظرة على البرنامج التعليمي الأساسي [هنا](../training#train-a-tensorflow-model-with-keras)! 
+ + +لضبط نموذج في TensorFlow، ابدأ بإعداد دالة مُحسِّن وجدول معدل تعلم وبعض المعلمات الفائقة للتدريب: + +```py +>>> from transformers import AdamWeightDecay + +>>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01) +``` + +ثم يمكنك تحميل T5 باستخدام [`TFAutoModelForSeq2SeqLM`]: + +```py +>>> from transformers import TFAutoModelForSeq2SeqLM + +>>> model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint) +``` + +حوّل مجموعات البيانات الخاصة بك إلى تنسيق `tf.data.Dataset` باستخدام [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: + +```py +>>> tf_train_set = model.prepare_tf_dataset( +... tokenized_books["train"], +... shuffle=True, +... batch_size=16, +... collate_fn=data_collator, +... ) + +>>> tf_test_set = model.prepare_tf_dataset( +... tokenized_books["test"], +... shuffle=False, +... batch_size=16, +... collate_fn=data_collator, +... ) +``` + +قم بتكوين النموذج للتدريب باستخدام [`compile`](https://keras.io/api/models/model_training_apis/#compile-method). لاحظ أن جميع نماذج Transformers تحتوي على دالة خسارة ذات صلة بالمهمة بشكل افتراضي، لذلك لا تحتاج إلى تحديد واحدة إلا إذا كنت ترغب في ذلك: + +```py +>>> import tensorflow as tf + +>>> model.compile(optimizer=optimizer) # No loss argument! +``` + +آخر شيئين يجب إعدادهما قبل بدء التدريب هما حساب مقياس SacreBLEU من التوقعات، وتوفير طريقة لدفع نموذجك إلى Hub. يتم كلاهما باستخدام [استدعاءات Keras](../main_classes/keras_callbacks). + +مرر دالة `compute_metrics` الخاصة بك إلى [`~transformers.KerasMetricCallback`]: + +```py +>>> from transformers.keras_callbacks import KerasMetricCallback + +>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_test_set) +``` + +حدد مكان دفع نموذجك ومعالجك اللغوي في [`~transformers.PushToHubCallback`]: + +```py +>>> from transformers.keras_callbacks import PushToHubCallback + +>>> push_to_hub_callback = PushToHubCallback( +... output_dir="my_awesome_opus_books_model", +... tokenizer=tokenizer, +... ) +``` + +ثم اجمع استدعاءاتك معًا: + +```py +>>> callbacks = [metric_callback, push_to_hub_callback] +``` + +أخيرًا، أنت جاهز لبدء تدريب نموذجك! اتصل بـ [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) مع مجموعات بيانات التدريب والتحقق من الصحة وعدد الحقب واستدعاءاتك لضبط النموذج: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=callbacks) +``` + +بمجرد اكتمال التدريب، يتم تحميل نموذجك تلقائيًا إلى Hub حتى يتمكن الجميع من استخدامه! + + + + + +للحصول على مثال أكثر تعمقًا لكيفية ضبط نموذج للترجمة، ألق نظرة على [دفتر ملاحظات PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation.ipynb) المقابل +أو [دفتر ملاحظات TensorFlow](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation-tf.ipynb). + + + +## الاستدلال (Inference) + +رائع، الآن بعد أن قمت بضبط نموذج، يمكنك استخدامه للاستدلال! + +أحضر بعض النصوص التي ترغب في ترجمتها إلى لغة أخرى. بالنسبة لـ T5، تحتاج إلى إضافة بادئة إلى مدخلاتك اعتمادًا على المهمة التي تعمل عليها. للترجمة من الإنجليزية إلى الفرنسية، يجب عليك إضافة بادئة إلى مدخلاتك كما هو موضح أدناه: + +```py +>>> text = "translate English to French: Legumes share resources with nitrogen-fixing bacteria." +``` + +أبسط طريقة لتجربة نموذجك المضبوط للاستدلال هي استخدامه في [`pipeline`]. قم بإنشاء مثيل لـ `pipeline` للترجمة باستخدام نموذجك، ومرر النص الخاص بك إليه: + +```py +>>> from transformers import pipeline + +# تغيير `xx` إلى لغة الإدخال و `yy` إلى لغة المخرجات المطلوبة. 
+# أمثلة: "en" للغة الإنجليزية، "fr" للغة الفرنسية، "de" للغة الألمانية، "es" للغة الإسبانية، "zh" للغة الصينية، إلخ؛ translation_en_to_fr تترجم من الإنجليزية إلى الفرنسية +# يمكنك عرض جميع قوائم اللغات هنا - https://huggingface.co/languages +>>> translator = pipeline("translation_xx_to_yy", model="username/my_awesome_opus_books_model") +>>> translator(text) +[{'translation_text': 'Legumes partagent des ressources avec des bactéries azotantes.'}] +``` + +يمكنك أيضًا تكرار نتائج `pipeline` يدويًا إذا أردت: + + + +قم بتحويل النص إلى رموز وإرجاع `input_ids` كموترات PyTorch: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_opus_books_model") +>>> inputs = tokenizer(text, return_tensors="pt").input_ids +``` + +استخدم الدالة [`~generation.GenerationMixin.generate`] لإنشاء الترجمة. لمزيد من التفاصيل حول استراتيجيات توليد النصوص المختلفة والمعلمات للتحكم في التوليد، تحقق من واجهة برمجة تطبيقات [توليد النصوص](../main_classes/text_generation). + +```py +>>> from transformers import AutoModelForSeq2SeqLM + +>>> model = AutoModelForSeq2SeqLM.from_pretrained("username/my_awesome_opus_books_model") +>>> outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95) +``` + +فك تشفير معرفات الرموز المولدة مرة أخرى إلى نص: + +```py +>>> tokenizer.decode(outputs[0], skip_special_tokens=True) +'Les lignées partagent des ressources avec des bactéries enfixant l'azote.' +``` + + +قم بتحويل النص إلى رموز وإرجاع `input_ids` كموترات TensorFlow: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_opus_books_model") +>>> inputs = tokenizer(text, return_tensors="tf").input_ids +``` + +استخدم طريقة [`~transformers.generation_tf_utils.TFGenerationMixin.generate`] لإنشاء الترجمة. لمزيد من التفاصيل حول استراتيجيات توليد النصوص المختلفة والمعلمات للتحكم في التوليد، تحقق من واجهة برمجة تطبيقات [توليد النصوص](../main_classes/text_generation). + +```py +>>> from transformers import TFAutoModelForSeq2SeqLM + +>>> model = TFAutoModelForSeq2SeqLM.from_pretrained("username/my_awesome_opus_books_model") +>>> outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95) +``` + +فك تشفير معرفات الرموز المولدة مرة أخرى إلى نص: + +```py +>>> tokenizer.decode(outputs[0], skip_special_tokens=True) +'Les lugumes partagent les ressources avec des bactéries fixatrices d'azote.' 
+``` + + \ No newline at end of file diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index a076f704b8ede2..40780d24d51c49 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -386,6 +386,8 @@ title: DeBERTa-v2 - local: model_doc/dialogpt title: DialoGPT + - local: model_doc/diffllama + title: DiffLlama - local: model_doc/distilbert title: DistilBERT - local: model_doc/dpr @@ -450,6 +452,8 @@ title: Granite - local: model_doc/granitemoe title: GraniteMoe + - local: model_doc/helium + title: Helium - local: model_doc/herbert title: HerBERT - local: model_doc/ibert @@ -504,6 +508,8 @@ title: MobileBERT - local: model_doc/modernbert title: ModernBert + - local: model_doc/moonshine + title: moonshine - local: model_doc/mpnet title: MPNet - local: model_doc/mpt @@ -719,6 +725,8 @@ title: Swin2SR - local: model_doc/table-transformer title: Table Transformer + - local: model_doc/textnet + title: TextNet - local: model_doc/timm_wrapper title: Timm Wrapper - local: model_doc/upernet @@ -737,6 +745,8 @@ title: ViTMatte - local: model_doc/vit_msn title: ViTMSN + - local: model_doc/vitpose + title: ViTPose - local: model_doc/yolos title: YOLOS - local: model_doc/zoedepth @@ -754,8 +764,6 @@ title: dac - local: model_doc/encodec title: EnCodec - - local: model_doc/hiera - title: Hiera - local: model_doc/hubert title: Hubert - local: model_doc/mctct @@ -854,6 +862,8 @@ title: DePlot - local: model_doc/donut title: Donut + - local: model_doc/emu3 + title: Emu3 - local: model_doc/flava title: FLAVA - local: model_doc/git diff --git a/docs/source/en/chat_templating.md b/docs/source/en/chat_templating.md index 0108cb48e95cee..c44420543cf89f 100644 --- a/docs/source/en/chat_templating.md +++ b/docs/source/en/chat_templating.md @@ -23,7 +23,7 @@ of text (as is the case with a standard language model), the model instead conti of one or more **messages**, each of which includes a **role**, like "user" or "assistant", as well as message text. Much like tokenization, different models expect very different input formats for chat. This is the reason we added -**chat templates** as a feature. Chat templates are part of the tokenizer. They specify how to convert conversations, +**chat templates** as a feature. Chat templates are part of the tokenizer for text-only LLMs or processor for multimodal LLMs. They specify how to convert conversations, represented as lists of messages, into a single tokenizable string in the format that the model expects. Let's make this concrete with a quick example using the `mistralai/Mistral-7B-Instruct-v0.1` model: @@ -66,10 +66,12 @@ for you, allowing you to write universal code that works for any model. ## How do I use chat templates? As you can see in the example above, chat templates are easy to use. Simply build a list of messages, with `role` -and `content` keys, and then pass it to the [`~PreTrainedTokenizer.apply_chat_template`] method. Once you do that, +and `content` keys, and then pass it to the [`~PreTrainedTokenizer.apply_chat_template`] or [`~ProcessorMixin.apply_chat_template`] method +depending on what type of model you are using. Once you do that, you'll get output that's ready to go! When using chat templates as input for model generation, it's also a good idea to use `add_generation_prompt=True` to add a [generation prompt](#what-are-generation-prompts). 
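
As a quick illustration of that flag, the sketch below renders the same conversation with and without a generation prompt; the Zephyr checkpoint is reused here purely as an example, and any model with a chat template should behave analogously.

```python
from transformers import AutoTokenizer

# Example checkpoint only; any chat model with a template works the same way.
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")

messages = [{"role": "user", "content": "How many helicopters can a human eat in one sitting?"}]

# Without a generation prompt, the formatted string ends right after the user turn...
print(tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False))

# ...with one, the template appends the tokens that open an assistant turn, which is what you
# want when the result is passed on to `model.generate()`.
print(tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
```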
+## Usage with text-only LLMs Here's an example of preparing input for `model.generate()`, using `Zephyr` again: ```python @@ -116,6 +118,44 @@ How many helicopters can a human eat in one sitting? Matey, I'm afraid I must inform ye that humans cannot eat helicopters. Helicopters are not food, they are flying machines. Food is meant to be eaten, like a hearty plate o' grog, a savory bowl o' stew, or a delicious loaf o' bread. But helicopters, they be for transportin' and movin' around, not for eatin'. So, I'd say none, me hearties. None at all. ``` +## Usage with multimodal LLMs + +For multimodal LLMs such as [LLaVA](https://huggingface.co/llava-hf) the prompts can be formatted in a similar way. The only difference is you need to pass input images/videos as well along with the text. Each `"content"` +has to be a list containing either a text or an image/video. + +Here's an example of preparing input for using `LLaVA` model: + +```python +from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration + +model_id = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf" +model = LlavaOnevisionForConditionalGeneration.from_pretrained(model_id) # You may want to use bfloat16 and/or move to GPU here +processor = AutoProcessor.from_pretrained(model_id) + +messages = [ + { + "role": "system", + "content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}], + }, + { + "role": "user", + "content": [ + {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"}, + {"type": "text", "text": "What are these?"}, + ], + }, +] + +processed_chat = processor.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt") +print(processor.batch_decode(processed_chat["input_ids"][:, :30])) +``` +This yields a string in LLaVAs expected input format with many `` tokens at the end. +The `` tokens are placeholders and each one will be replaced by image embeddings when the mode is run in the forward call. The `processed_chat` can be further passed into [`~GenerationMixin.generate`] to generate text. +```text +'<|im_start|>system +You are a friendly chatbot who always responds in the style of a pirate<|im_end|><|im_start|>user ' +``` + Arr, 'twas easy after all! ## Is there an automated pipeline for chat? diff --git a/docs/source/en/deepspeed.md b/docs/source/en/deepspeed.md index 7f7995c4664133..ad3d4240f85619 100644 --- a/docs/source/en/deepspeed.md +++ b/docs/source/en/deepspeed.md @@ -586,6 +586,20 @@ You can choose the communication data type by setting the `communication_data_ty } ``` +### Universal Checkpointing + +[Universal Checkpointing](https://www.deepspeed.ai/tutorials/universal-checkpointing) is an efficient and flexible feature for saving and loading model checkpoints. It enables seamless model training continuation and fine-tuning across different model architectures, parallelism techniques, and training configurations. + +Resume training with a universal checkpoint by setting [load_universal](https://www.deepspeed.ai/docs/config-json/#checkpoint-options) to `true` in the config file. + +```yaml +{ + "checkpoint": { + "load_universal": true + } +} +``` + ## Deployment DeepSpeed can be deployed by different launchers such as [torchrun](https://pytorch.org/docs/stable/elastic/run.html), the `deepspeed` launcher, or [Accelerate](https://huggingface.co/docs/accelerate/basic_tutorials/launch#using-accelerate-launch). 
To deploy, add `--deepspeed ds_config.json` to the [`Trainer`] command line. It’s recommended to use DeepSpeed’s [`add_config_arguments`](https://deepspeed.readthedocs.io/en/latest/initialize.html#argument-parsing) utility to add any necessary command line arguments to your code. diff --git a/docs/source/en/generation_strategies.md b/docs/source/en/generation_strategies.md index 47032a2a292b1b..4c4c19a3d6628d 100644 --- a/docs/source/en/generation_strategies.md +++ b/docs/source/en/generation_strategies.md @@ -96,6 +96,12 @@ distribution over the entire vocabulary with various strategy-specific adjustmen the decoding strategies that support multiple sequence candidates, e.g. variations of beam search and sampling. Decoding strategies like greedy search and contrastive search return a single output sequence. +It is also possible to extend `generate()` with external libraries or handcrafted code. The `logits_processor` argument +allows you to pass custom [`LogitsProcessor`] instances, allowing you to manipulate the next token probability +distributions. Likewise, the `stopping_criteria` argument lets you set custom [`StoppingCriteria`] to stop text generation. +The [`logits-processor-zoo`](https://github.com/NVIDIA/logits-processor-zoo) library contains examples of external +`generate()`-compatible extensions. + ## Save a custom decoding strategy with your model If you would like to share your fine-tuned model with a specific generation configuration, you can: @@ -435,6 +441,28 @@ To enable assisted decoding, set the `assistant_model` argument with a model. ['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a'] ``` + + +If you're using a `pipeline` object, all you need to do is to pass the assistant checkpoint under `assistant_model` + +```python +>>> from transformers import pipeline +>>> import torch + +>>> pipe = pipeline( +... "text-generation", +... model="meta-llama/Llama-3.1-8B", +... assistant_model="meta-llama/Llama-3.2-1B", # This extra line is all that's needed, also works with UAD +... torch_dtype=torch.bfloat16 +>>> ) +>>> pipe_output = pipe("Once upon a time, ", max_new_tokens=50, do_sample=False) +>>> pipe_output[0]["generated_text"] +'Once upon a time, 3D printing was a niche technology that was only' +``` + + + + When using assisted decoding with sampling methods, you can use the `temperature` argument to control the randomness, just like in multinomial sampling. However, in assisted decoding, reducing the temperature may help improve the latency. diff --git a/docs/source/en/index.md b/docs/source/en/index.md index dcecfc872d61d0..1c90fd71b3d298 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -125,6 +125,7 @@ Flax), PyTorch, and/or TensorFlow. | [DETA](model_doc/deta) | ✅ | ❌ | ❌ | | [DETR](model_doc/detr) | ✅ | ❌ | ❌ | | [DialoGPT](model_doc/dialogpt) | ✅ | ✅ | ✅ | +| [DiffLlama](model_doc/diffllama) | ✅ | ❌ | ❌ | | [DiNAT](model_doc/dinat) | ✅ | ❌ | ❌ | | [DINOv2](model_doc/dinov2) | ✅ | ❌ | ✅ | | [DINOv2 with Registers](model_doc/dinov2_with_registers) | ✅ | ❌ | ❌ | @@ -136,6 +137,7 @@ Flax), PyTorch, and/or TensorFlow. | [EfficientFormer](model_doc/efficientformer) | ✅ | ✅ | ❌ | | [EfficientNet](model_doc/efficientnet) | ✅ | ❌ | ❌ | | [ELECTRA](model_doc/electra) | ✅ | ✅ | ✅ | +| [Emu3](model_doc/emu3) | ✅ | ❌ | ❌ | | [EnCodec](model_doc/encodec) | ✅ | ❌ | ❌ | | [Encoder decoder](model_doc/encoder-decoder) | ✅ | ✅ | ✅ | | [ERNIE](model_doc/ernie) | ✅ | ❌ | ❌ | @@ -171,6 +173,7 @@ Flax), PyTorch, and/or TensorFlow. 
| [Graphormer](model_doc/graphormer) | ✅ | ❌ | ❌ | | [Grounding DINO](model_doc/grounding-dino) | ✅ | ❌ | ❌ | | [GroupViT](model_doc/groupvit) | ✅ | ✅ | ❌ | +| [Helium](model_doc/helium) | ✅ | ❌ | ❌ | | [HerBERT](model_doc/herbert) | ✅ | ✅ | ✅ | | [Hiera](model_doc/hiera) | ✅ | ❌ | ❌ | | [Hubert](model_doc/hubert) | ✅ | ✅ | ❌ | @@ -234,6 +237,7 @@ Flax), PyTorch, and/or TensorFlow. | [MobileViT](model_doc/mobilevit) | ✅ | ✅ | ❌ | | [MobileViTV2](model_doc/mobilevitv2) | ✅ | ❌ | ❌ | | [ModernBERT](model_doc/modernbert) | ✅ | ❌ | ❌ | +| [Moonshine](model_doc/moonshine) | ✅ | ❌ | ❌ | | [Moshi](model_doc/moshi) | ✅ | ❌ | ❌ | | [MPNet](model_doc/mpnet) | ✅ | ✅ | ❌ | | [MPT](model_doc/mpt) | ✅ | ❌ | ❌ | @@ -325,6 +329,7 @@ Flax), PyTorch, and/or TensorFlow. | [Table Transformer](model_doc/table-transformer) | ✅ | ❌ | ❌ | | [TAPAS](model_doc/tapas) | ✅ | ✅ | ❌ | | [TAPEX](model_doc/tapex) | ✅ | ✅ | ✅ | +| [TextNet](model_doc/textnet) | ✅ | ❌ | ❌ | | [Time Series Transformer](model_doc/time_series_transformer) | ✅ | ❌ | ❌ | | [TimeSformer](model_doc/timesformer) | ✅ | ❌ | ❌ | | [TimmWrapperModel](model_doc/timm_wrapper) | ✅ | ❌ | ❌ | @@ -354,6 +359,8 @@ Flax), PyTorch, and/or TensorFlow. | [ViTMAE](model_doc/vit_mae) | ✅ | ✅ | ❌ | | [ViTMatte](model_doc/vitmatte) | ✅ | ❌ | ❌ | | [ViTMSN](model_doc/vit_msn) | ✅ | ❌ | ❌ | +| [VitPose](model_doc/vitpose) | ✅ | ❌ | ❌ | +| [VitPoseBackbone](model_doc/vitpose_backbone) | ✅ | ❌ | ❌ | | [VITS](model_doc/vits) | ✅ | ❌ | ❌ | | [ViViT](model_doc/vivit) | ✅ | ❌ | ❌ | | [Wav2Vec2](model_doc/wav2vec2) | ✅ | ✅ | ✅ | diff --git a/docs/source/en/installation.md b/docs/source/en/installation.md index af7c97ef3508ae..ae1f2101d749b3 100644 --- a/docs/source/en/installation.md +++ b/docs/source/en/installation.md @@ -32,27 +32,18 @@ Install 🤗 Transformers for whichever deep learning library you're working wit You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, take a look at this [guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). A virtual environment makes it easier to manage different projects, and avoid compatibility issues between dependencies. -Start by creating a virtual environment in your project directory: +Now you're ready to install 🤗 Transformers with the following command: ```bash -python -m venv .env +pip install transformers ``` -Activate the virtual environment. On Linux and MacOs: +For GPU acceleration, install the appropriate CUDA drivers for [PyTorch](https://pytorch.org/get-started/locally) and TensorFlow(https://www.tensorflow.org/install/pip). -```bash -source .env/bin/activate -``` -Activate Virtual environment on Windows +Run the command below to check if your system detects an NVIDIA GPU. ```bash -.env/Scripts/activate -``` - -Now you're ready to install 🤗 Transformers with the following command: - -```bash -pip install transformers +nvidia-smi ``` For CPU-support only, you can conveniently install 🤗 Transformers and a deep learning library in one line. For example, install 🤗 Transformers and PyTorch with: @@ -254,3 +245,36 @@ Once your file is downloaded and locally cached, specify it's local path to load See the [How to download files from the Hub](https://huggingface.co/docs/hub/how-to-downstream) section for more details on downloading files stored on the Hub. + +## Troubleshooting + +See below for some of the more common installation issues and how to resolve them. 
+ +### Unsupported Python version + +Ensure you are using Python 3.9 or later. Run the command below to check your Python version. + +``` +python --version +``` + +### Missing dependencies + +Install all required dependencies by running the following command. Ensure you’re in the project directory before executing the command. + +``` +pip install -r requirements.txt +``` + +### Windows-specific + +If you encounter issues on Windows, you may need to activate Developer Mode. Navigate to Windows Settings > For Developers > Developer Mode. + +Alternatively, create and activate a virtual environment as shown below. + +``` +python -m venv env +.\env\Scripts\activate +``` + + diff --git a/docs/source/en/llm_optims.md b/docs/source/en/llm_optims.md index 17ebb841de7a39..37406ea0bef298 100644 --- a/docs/source/en/llm_optims.md +++ b/docs/source/en/llm_optims.md @@ -156,9 +156,11 @@ def decode_one_tokens(model, cur_token, input_pos, cache_position, past_key_valu There are a few important things you must do to enable static kv-cache and `torch.compile` with the `StaticCache` method: 1. Initialize the [`StaticCache`] instance before using the model for inference. There you can configure parameters like the maximum batch size and sequence length. 2. Call `torch.compile` on the model to compile the forward pass with the static kv-cache. -3. Set `enable_math=True` in the [torch.backends.cuda.sdp_kernel](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html) context manager to enable the native PyTorch C++ implementation of scaled dot product attention to speed up inference even more. +3. Use `SDPBackend.MATH` in the [torch.nn.attention.sdpa_kernel](https://pytorch.org/docs/stable/generated/torch.nn.attention.sdpa_kernel.html) context manager to enable the native PyTorch C++ implementation of scaled dot product attention to speed up inference even more. ```py +from torch.nn.attention import SDPBackend, sdpa_kernel + batch_size, seq_length = inputs["input_ids"].shape with torch.no_grad(): past_key_values = StaticCache( @@ -179,7 +181,7 @@ with torch.no_grad(): decode_one_tokens = torch.compile(decode_one_tokens, mode="reduce-overhead", fullgraph=True) cache_position = torch.tensor([seq_length + 1], device=torch_device) for _ in range(1, NUM_TOKENS_TO_GENERATE): - with torch.backends.cuda.sdp_kernel(enable_flash=False, enable_mem_efficient=False, enable_math=True): + with sdpa_kernel(SDPBackend.MATH): next_token = decode_one_tokens(model, next_token.clone(), None, cache_position, past_key_values) generated_ids[:, cache_position] = next_token.int() cache_position += 1 @@ -453,10 +455,11 @@ Scaled dot product attention (SDPA) is automatically enabled in PyTorch 2.0 and > [!TIP] > SDPA supports FlashAttention-2 as long as you have the latest PyTorch version installed. -Use the [torch.backends.cuda.sdp_kernel](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html) context manager to explicitly enable or disable any of the three attention algorithms. For example, set `enable_flash=True` to enable FlashAttention. +Use the [torch.nn.attention.sdpa_kernel](https://pytorch.org/docs/stable/generated/torch.nn.attention.sdpa_kernel.html) context manager to explicitly enable or disable any of the four attention algorithms. For example, use `SDPBackend.FLASH_ATTENTION` to enable FlashAttention. 
```py import torch +from torch.nn.attention import SDPBackend, sdpa_kernel from transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained( @@ -464,7 +467,7 @@ model = AutoModelForCausalLM.from_pretrained( torch_dtype=torch.bfloat16, ) -with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False): +with sdpa_kernel(SDPBackend.FLASH_ATTENTION): outputs = model.generate(**inputs) ``` diff --git a/docs/source/en/llm_tutorial.md b/docs/source/en/llm_tutorial.md index 097d7bf1e9ca38..eaf59c741cdd28 100644 --- a/docs/source/en/llm_tutorial.md +++ b/docs/source/en/llm_tutorial.md @@ -265,8 +265,9 @@ While the autoregressive generation process is relatively straightforward, makin ### Related libraries -1. [`optimum`](https://github.com/huggingface/optimum), an extension of 🤗 Transformers that optimizes for specific hardware devices. +1. [`optimum`](https://github.com/huggingface/optimum), an extension of 🤗 Transformers that optimizes for specific hardware devices; 2. [`outlines`](https://github.com/outlines-dev/outlines), a library where you can constrain text generation (e.g. to generate JSON files); -3. [`SynCode`](https://github.com/uiuc-focal-lab/syncode), a library for context-free grammar guided generation. (e.g. JSON, SQL, Python) +3. [`SynCode`](https://github.com/uiuc-focal-lab/syncode), a library for context-free grammar guided generation (e.g. JSON, SQL, Python); 4. [`text-generation-inference`](https://github.com/huggingface/text-generation-inference), a production-ready server for LLMs; 5. [`text-generation-webui`](https://github.com/oobabooga/text-generation-webui), a UI for text generation; +6. [`logits-processor-zoo`](https://github.com/NVIDIA/logits-processor-zoo), containing additional options to control text generation with 🤗 Transformers. See our related [blog post](https://huggingface.co/blog/logits-processor-zoo). diff --git a/docs/source/en/model_doc/diffllama.md b/docs/source/en/model_doc/diffllama.md new file mode 100644 index 00000000000000..80afcfe433e9dd --- /dev/null +++ b/docs/source/en/model_doc/diffllama.md @@ -0,0 +1,59 @@ + + +# DiffLlama + +## Overview + +The DiffLlama model was proposed in [Differential Transformer](https://arxiv.org/abs/2410.05258) by Kazuma Matsumoto and . +This model is combine Llama model and Differential Transformer's Attention. + +The abstract from the paper is the following: + +*Transformer tends to overallocate attention to irrelevant context. In this work, we introduce Diff Transformer, which amplifies attention to the relevant context while canceling noise. Specifically, the differential attention mechanism calculates attention scores as the difference between two separate softmax attention maps. The subtraction cancels noise, promoting the emergence of sparse attention patterns. Experimental results on language modeling show that Diff Transformer outperforms Transformer in various settings of scaling up model size and training tokens. More intriguingly, it offers notable advantages in practical applications, such as long-context modeling, key information retrieval, hallucination mitigation, in-context learning, and reduction of activation outliers. By being less distracted by irrelevant context, Diff Transformer can mitigate hallucination in question answering and text summarization. For in-context learning, Diff Transformer not only enhances accuracy but is also more robust to order permutation, which was considered as a chronic robustness issue. 
The results position Diff Transformer as a highly effective and promising architecture to advance large language models.*
+
+### Usage tips
+The hyperparameters of this model are the same as those of the Llama model.
+
+
+## DiffLlamaConfig
+
+[[autodoc]] DiffLlamaConfig
+
+## DiffLlamaModel
+
+[[autodoc]] DiffLlamaModel
+    - forward
+
+## DiffLlamaForCausalLM
+
+[[autodoc]] DiffLlamaForCausalLM
+    - forward
+
+## DiffLlamaForSequenceClassification
+
+[[autodoc]] DiffLlamaForSequenceClassification
+    - forward
+
+## DiffLlamaForQuestionAnswering
+
+[[autodoc]] DiffLlamaForQuestionAnswering
+    - forward
+
+## DiffLlamaForTokenClassification
+
+[[autodoc]] DiffLlamaForTokenClassification
+    - forward
diff --git a/docs/source/en/model_doc/emu3.md b/docs/source/en/model_doc/emu3.md
new file mode 100644
index 00000000000000..0b3220c073fb65
--- /dev/null
+++ b/docs/source/en/model_doc/emu3.md
@@ -0,0 +1,179 @@
+
+
+# Emu3
+
+## Overview
+
+The Emu3 model was proposed in [Emu3: Next-Token Prediction is All You Need](https://arxiv.org/abs/2409.18869) by Xinlong Wang, Xiaosong Zhang, Zhengxiong Luo, Quan Sun, Yufeng Cui, Jinsheng Wang, Fan Zhang, Yueze Wang, Zhen Li, Qiying Yu, Yingli Zhao, Yulong Ao, Xuebin Min, Tao Li, Boya Wu, Bo Zhao, Bowen Zhang, Liangdong Wang, Guang Liu, Zheqi He, Xi Yang, Jingjing Liu, Yonghua Lin, Tiejun Huang, Zhongyuan Wang.
+
+Emu3 is a multimodal LLM that uses vector quantization to tokenize images into discrete tokens. Discretized image tokens are later fused with text token ids for image and text generation. The model can additionally generate images by predicting image token ids.
+
+
+The abstract from the paper is the following:
+
+*While next-token prediction is considered a promising path towards artificial general intelligence, it has struggled to excel in multimodal tasks, which are still dominated by diffusion models (e.g., Stable Diffusion) and compositional approaches (e.g., CLIP combined with LLMs). In this paper, we introduce Emu3, a new suite of state-of-the-art multimodal models trained solely with next-token prediction. By tokenizing images, text, and videos into a discrete space, we train a single transformer from scratch on a mixture of multimodal sequences. Emu3 outperforms several well-established task-specific models in both generation and perception tasks, surpassing flagship models such as SDXL and LLaVA-1.6, while eliminating the need for diffusion or compositional architectures. Emu3 is also capable of generating high-fidelity video via predicting the next token in a video sequence. We simplify complex multimodal model designs by converging on a singular focus: tokens, unlocking great potential for scaling both during training and inference. Our results demonstrate that next-token prediction is a promising path towards building general multimodal intelligence beyond language. We open-source key techniques and models to support further research in this direction.*
+
+Tips:
+
+- We advise users to set `processor.tokenizer.padding_side = "left"` before batched generation as it leads to more accurate results.
+
+- Note that the model has been trained with a specific prompt format for chatting. Use `processor.apply_chat_template(my_conversation_dict)` to correctly format your prompts.
+
+- Emu3 has two different checkpoints for image generation and text generation; make sure to use the correct checkpoint when loading the model. To generate an image, it is advised to use `prefix_allowed_tokens_fn` so that the generated tokens are sampled only from possible image tokens.
See more below for usage examples. + +> [!TIP] +> Emu3 implementation in Transformers uses a special image token to indicate where to merge image embeddings. The special image token isn't new and uses one of the reserved tokens: `<|extra_0|>`. You have to add `` to your prompt in the place where the image should be embedded for correct generation. + + +This model was contributed by [RaushanTurganbay](https://huggingface.co/RaushanTurganbay). +The original code can be found [here](https://github.com/baaivision/Emu3). + + +## Usage example + +### Text generation inference + +Here's how to load the model and perform inference in half-precision (`torch.bfloat16`) to generate textual output from text or text and image inputs: + +```python +from transformers import Emu3Processor, Emu3ForConditionalGeneration +import torch +from PIL import Image +import requests + +processor = Emu3Processor.from_pretrained("Emu3-community/Emu3-Chat-hf") +model = Emu3ForConditionalGeneration.from_pretrained("Emu3-community/Emu3-Chat-hf", torch_dtype=torch.bfloat16, device_map="cuda") + +# prepare image and text prompt +url = 'http://images.cocodataset.org/val2017/000000039769.jpg' +image = Image.open(requests.get(url, stream=True).raw) +prompt = "What do you see in this image?" + +inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device, dtype=torch.bfloat16) + +# autoregressively complete prompt +output = model.generate(**inputs, max_new_tokens=50) +print(processor.decode(output[0], skip_special_tokens=True)) +``` + +### Image generation inference + +Emu3 can also generate images from textual input. Here is how you can do it: + +```python +processor = Emu3Processor.from_pretrained("Emu3-community/Emu3-Gen-hf") +model = Emu3ForConditionalGeneration.from_pretrained("Emu3-community/Emu3-Gen-hf", torch_dtype="bfloat16", device_map="auto", attn_implementation="flash_attention_2") + + +inputs = processor( + text=["a portrait of young girl. masterpiece, film grained, best quality.", "a dog running under the rain"], + padding=True, + return_tensors="pt", + return_for_image_generation=True, +) +inputs = inputs.to(device="cuda:0", dtype=torch.bfloat16) + +neg_prompt = "lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry." 
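+# The negative prompt is tokenized separately below and later passed to `generate()` as
+# `negative_prompt_ids`/`negative_prompt_attention_mask`, steering classifier-free guidance away from these artifacts.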
+neg_inputs = processor(text=[neg_prompt] * 2, return_tensors="pt").to(device="cuda:0") + +image_sizes = inputs.pop("image_sizes") +HEIGHT, WIDTH = image_sizes[0] +VISUAL_TOKENS = model.vocabulary_mapping.image_tokens + +def prefix_allowed_tokens_fn(batch_id, input_ids): + height, width = HEIGHT, WIDTH + visual_tokens = VISUAL_TOKENS + image_wrapper_token_id = torch.tensor([processor.tokenizer.image_wrapper_token_id], device=model.device) + eoi_token_id = torch.tensor([processor.tokenizer.eoi_token_id], device=model.device) + eos_token_id = torch.tensor([processor.tokenizer.eos_token_id], device=model.device) + pad_token_id = torch.tensor([processor.tokenizer.pad_token_id], device=model.device) + eof_token_id = torch.tensor([processor.tokenizer.eof_token_id], device=model.device) + eol_token_id = processor.tokenizer.encode("<|extra_200|>", return_tensors="pt")[0] + + position = torch.nonzero(input_ids == image_wrapper_token_id, as_tuple=True)[0][0] + offset = input_ids.shape[0] - position + if offset % (width + 1) == 0: + return (eol_token_id, ) + elif offset == (width + 1) * height + 1: + return (eof_token_id, ) + elif offset == (width + 1) * height + 2: + return (eoi_token_id, ) + elif offset == (width + 1) * height + 3: + return (eos_token_id, ) + elif offset > (width + 1) * height + 3: + return (pad_token_id, ) + else: + return visual_tokens + + +out = model.generate( + **inputs, + max_new_tokens=50_000, # make sure to have enough tokens for one image + prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, + return_dict_in_generate=True, + negative_prompt_ids=neg_inputs.input_ids, # indicate for Classifier-Free Guidance + negative_prompt_attention_mask=neg_inputs.attention_mask, +) + +image = model.decode_image_tokens(out.sequences[:, inputs.input_ids.shape[1]: ], height=HEIGHT, width=WIDTH) +images = processor.postprocess(list(image.float()), return_tensors="PIL.Image.Image") # internally we convert to np but it's not supported in bf16 precision +for i, image in enumerate(images['pixel_values']): + image.save(f"result{i}.png") + +``` + + +## Emu3Config + +[[autodoc]] Emu3Config + +## Emu3VQVAEConfig + +[[autodoc]] Emu3VQVAEConfig + +## Emu3TextConfig + +[[autodoc]] Emu3TextConfig + +## Emu3Processor + +[[autodoc]] Emu3Processor + +## Emu3ImageProcessor + +[[autodoc]] Emu3ImageProcessor + - preprocess + +## Emu3VQVAE + +[[autodoc]] Emu3VQVAE + - forward + +## Emu3TextModel + +[[autodoc]] Emu3TextModel + - forward + +## Emu3ForCausalLM + +[[autodoc]] Emu3ForCausalLM + - forward + +## Emu3ForConditionalGeneration + +[[autodoc]] Emu3ForConditionalGeneration + - forward diff --git a/docs/source/en/model_doc/helium.md b/docs/source/en/model_doc/helium.md new file mode 100644 index 00000000000000..df5927544df960 --- /dev/null +++ b/docs/source/en/model_doc/helium.md @@ -0,0 +1,158 @@ + + +# Helium + + +## Overview + +Helium was proposed in [Announcing Helium-1 Preview](https://kyutai.org/2025/01/13/helium.html) by the Kyutai Team. + + +Helium-1 preview is a lightweight language model with 2B parameters, targeting edge and mobile devices. +It supports the following languages: English, French, German, Italian, Portuguese, Spanish. 
+ +- **Developed by:** Kyutai +- **Model type:** Large Language Model +- **Language(s) (NLP):** English, French, German, Italian, Portuguese, Spanish +- **License:** CC-BY 4.0 + + + + +## Evaluation + + + +#### Testing Data + + + +The model was evaluated on MMLU, TriviaQA, NaturalQuestions, ARC Easy & Challenge, Open Book QA, Common Sense QA, +Physical Interaction QA, Social Interaction QA, HellaSwag, WinoGrande, Multilingual Knowledge QA, FLORES 200. + +#### Metrics + + + +We report accuracy on MMLU, ARC, OBQA, CSQA, PIQA, SIQA, HellaSwag, WinoGrande. +We report exact match on TriviaQA, NQ and MKQA. +We report BLEU on FLORES. + +### English Results + +| Benchmark | Helium-1 Preview | HF SmolLM2 (1.7B) | Gemma-2 (2.6B) | Llama-3.2 (3B) | Qwen2.5 (1.5B) | +|--------------|--------|--------|--------|--------|--------| +| | | | | | | +| MMLU | 51.2 | 50.4 | 53.1 | 56.6 | 61.0 | +| NQ | 17.3 | 15.1 | 17.7 | 22.0 | 13.1 | +| TQA | 47.9 | 45.4 | 49.9 | 53.6 | 35.9 | +| ARC E | 80.9 | 81.8 | 81.1 | 84.6 | 89.7 | +| ARC C | 62.7 | 64.7 | 66.0 | 69.0 | 77.2 | +| OBQA | 63.8 | 61.4 | 64.6 | 68.4 | 73.8 | +| CSQA | 65.6 | 59.0 | 64.4 | 65.4 | 72.4 | +| PIQA | 77.4 | 77.7 | 79.8 | 78.9 | 76.0 | +| SIQA | 64.4 | 57.5 | 61.9 | 63.8 | 68.7 | +| HS | 69.7 | 73.2 | 74.7 | 76.9 | 67.5 | +| WG | 66.5 | 65.6 | 71.2 | 72.0 | 64.8 | +| | | | | | | +| Average | 60.7 | 59.3 | 62.2 | 64.7 | 63.6 | + +#### Multilingual Results + +| Language | Benchmark | Helium-1 Preview | HF SmolLM2 (1.7B) | Gemma-2 (2.6B) | Llama-3.2 (3B) | Qwen2.5 (1.5B) | +|-----|--------------|--------|--------|--------|--------|--------| +| | | | | | | | +|German| MMLU | 45.6 | 35.3 | 45.0 | 47.5 | 49.5 | +|| ARC C | 56.7 | 38.4 | 54.7 | 58.3 | 60.2 | +|| HS | 53.5 | 33.9 | 53.4 | 53.7 | 42.8 | +|| MKQA | 16.1 | 7.1 | 18.9 | 20.2 | 10.4 | +| | | | | | | | +|Spanish| MMLU | 46.5 | 38.9 | 46.2 | 49.6 | 52.8 | +|| ARC C | 58.3 | 43.2 | 58.8 | 60.0 | 68.1 | +|| HS | 58.6 | 40.8 | 60.5 | 61.1 | 51.4 | +|| MKQA | 16.0 | 7.9 | 18.5 | 20.6 | 10.6 | + + +## Technical Specifications + +### Model Architecture and Objective + +| Hyperparameter | Value | +|--------------|--------| +| Layers | 24 | +| Heads | 20 | +| Model dimension | 2560 | +| MLP dimension | 7040 | +| Context size | 4096 | +| Theta RoPE | 100,000 | + +Tips: + +- This model was contributed by [Laurent Mazare](https://huggingface.co/lmz) + + +## Usage tips + +`Helium` can be found on the [Huggingface Hub](https://huggingface.co/collections/kyutai/helium-1-preview) + +In the following, we demonstrate how to use `helium-1-preview` for the inference. + +```python +>>> from transformers import AutoModelForCausalLM, AutoTokenizer +>>> device = "cuda" # the device to load the model onto + +>>> model = AutoModelForCausalLM.from_pretrained("helium-1-preview", device_map="auto") +>>> tokenizer = AutoTokenizer.from_pretrained("helium-1-preview") + +>>> prompt = "Give me a short introduction to large language model." 
+ +>>> messages = [{"role": "user", "content": prompt}] + +>>> text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + +>>> model_inputs = tokenizer([text], return_tensors="pt").to(device) + +>>> generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=512, do_sample=True) + +>>> generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)] + +>>> response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] +``` + +## HeliumConfig + +[[autodoc]] HeliumConfig + +## HeliumModel + +[[autodoc]] HeliumModel + - forward + +## HeliumForCausalLM + +[[autodoc]] HeliumForCausalLM + - forward + +## HeliumForSequenceClassification + +[[autodoc]] HeliumForSequenceClassification + - forward + +## HeliumForTokenClassification + +[[autodoc]] HeliumForTokenClassification + - forward diff --git a/docs/source/en/model_doc/modernbert.md b/docs/source/en/model_doc/modernbert.md index b641d7f3f58199..e90f34a903e49b 100644 --- a/docs/source/en/model_doc/modernbert.md +++ b/docs/source/en/model_doc/modernbert.md @@ -14,7 +14,7 @@ rendered properly in your Markdown viewer. --> -# ModernBert +# ModernBERT
@@ -27,7 +27,7 @@ rendered properly in your Markdown viewer. ## Overview -The ModernBert model was proposed in [Smarter, Better, Faster, Longer: A Modern Bidirectional Encoder for Fast, Memory Efficient, and Long Context Finetuning and Inference](https://arxiv.org/abs/2412.13663) by Benjamin Warner, Antoine Chaffin, Benjamin Clavié, Orion Weller, Oskar Hallström, Said Taghadouini, Alexis Galalgher, Raja Bisas, Faisal Ladhak, Tom Aarsen, Nathan Cooper, Grifin Adams, Jeremy Howard and Iacopo Poli. +The ModernBERT model was proposed in [Smarter, Better, Faster, Longer: A Modern Bidirectional Encoder for Fast, Memory Efficient, and Long Context Finetuning and Inference](https://arxiv.org/abs/2412.13663) by Benjamin Warner, Antoine Chaffin, Benjamin Clavié, Orion Weller, Oskar Hallström, Said Taghadouini, Alexis Galalgher, Raja Bisas, Faisal Ladhak, Tom Aarsen, Nathan Cooper, Grifin Adams, Jeremy Howard and Iacopo Poli. It is a refresh of the traditional encoder architecture, as used in previous models such as [BERT](https://huggingface.co/docs/transformers/en/model_doc/bert) and [RoBERTa](https://huggingface.co/docs/transformers/en/model_doc/roberta). diff --git a/docs/source/en/model_doc/moonshine.md b/docs/source/en/model_doc/moonshine.md new file mode 100644 index 00000000000000..571e3febdb4fa8 --- /dev/null +++ b/docs/source/en/model_doc/moonshine.md @@ -0,0 +1,56 @@ + + +# Moonshine + +## Overview + +The Moonshine model was proposed in [Moonshine: Speech Recognition for Live Transcription and Voice Commands +](https://arxiv.org/abs/2410.15608) by Nat Jeffries, Evan King, Manjunath Kudlur, Guy Nicholson, James Wang, Pete Warden. + +The abstract from the paper is the following: + +*This paper introduces Moonshine, a family of speech recognition models optimized for live transcription and voice command processing. Moonshine is based on an encoder-decoder transformer architecture and employs Rotary Position Embedding (RoPE) instead of traditional absolute position embeddings. The model is trained on speech segments of various lengths, but without using zero-padding, leading to greater efficiency for the encoder during inference time. When benchmarked against OpenAI's Whisper tiny-en, Moonshine Tiny demonstrates a 5x reduction in compute requirements for transcribing a 10-second speech segment while incurring no increase in word error rates across standard evaluation datasets. These results highlight Moonshine's potential for real-time and resource-constrained applications.* + +Tips: + +- Moonshine improves upon Whisper's architecture: + 1. It uses SwiGLU activation instead of GELU in the decoder layers + 2. Most importantly, it replaces absolute position embeddings with Rotary Position Embeddings (RoPE). This allows Moonshine to handle audio inputs of any length, unlike Whisper which is restricted to fixed 30-second windows. + +This model was contributed by [Eustache Le Bihan (eustlb)](https://huggingface.co/eustlb). +The original code can be found [here](https://github.com/usefulsensors/moonshine). 
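
Since the tips above stay at the architecture level, a minimal transcription sketch follows as a sanity check of the encoder-decoder generation path; the checkpoint name and audio file are assumptions rather than part of the original page, and the generic ASR pipeline is used for brevity.

```python
from transformers import pipeline

# Checkpoint name is illustrative; substitute whichever Moonshine checkpoint you intend to use.
transcriber = pipeline("automatic-speech-recognition", model="UsefulSensors/moonshine-tiny")

# Any short audio clip works; for file inputs the pipeline decodes and resamples via ffmpeg.
print(transcriber("sample.wav")["text"])
```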
+ +## Resources + +- [Automatic speech recognition task guide](../tasks/asr) + +## MoonshineConfig + +[[autodoc]] MoonshineConfig + +## MoonshineModel + +[[autodoc]] MoonshineModel + - forward + - _mask_input_features + +## MoonshineForConditionalGeneration + +[[autodoc]] MoonshineForConditionalGeneration + - forward + - generate + diff --git a/docs/source/en/model_doc/musicgen_melody.md b/docs/source/en/model_doc/musicgen_melody.md index 4d92d861f0bb5f..7b67713c42b743 100644 --- a/docs/source/en/model_doc/musicgen_melody.md +++ b/docs/source/en/model_doc/musicgen_melody.md @@ -266,7 +266,6 @@ Tips: ## MusicgenMelodyFeatureExtractor [[autodoc]] MusicgenMelodyFeatureExtractor - - _extract_stem_indices ## MusicgenMelodyConfig diff --git a/docs/source/en/model_doc/qwen2_audio.md b/docs/source/en/model_doc/qwen2_audio.md index f399a7e7320c17..2ef947ce430d40 100644 --- a/docs/source/en/model_doc/qwen2_audio.md +++ b/docs/source/en/model_doc/qwen2_audio.md @@ -34,6 +34,37 @@ The abstract from the paper is the following: `Qwen2-Audio-7B` and `Qwen2-Audio-7B-Instruct` can be found on the [Huggingface Hub](https://huggingface.co/Qwen) +### Inference + +```python +from io import BytesIO +from urllib.request import urlopen +import librosa +from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration + +model = Qwen2AudioForConditionalGeneration.from_pretrained("Qwen/Qwen2-Audio-7B", trust_remote_code=True, device_map="auto") +processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B", trust_remote_code=True) + +prompt = "<|audio_bos|><|AUDIO|><|audio_eos|>Generate the caption in English:" +url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Audio/glass-breaking-151256.mp3" +audio, sr = librosa.load(BytesIO(urlopen(url).read()), sr=processor.feature_extractor.sampling_rate) +inputs = processor(text=prompt, audios=audio, return_tensors="pt").to(model.device) + +generate_ids = model.generate(**inputs, max_length=256) +generate_ids = generate_ids[:, inputs.input_ids.size(1):] + +response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + +# We can also omit the audio_bos and audio_eos tokens +prompt = "<|AUDIO|>Generate the caption in English:" +inputs = processor(text=prompt, audios=audio, return_tensors="pt").to(model.device) + +generate_ids = model.generate(**inputs, max_length=256) +generate_ids = generate_ids[:, inputs.input_ids.size(1):] + +response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] +``` + In the following, we demonstrate how to use `Qwen2-Audio-7B-Instruct` for the inference, supporting both voice chat and audio analysis modes. Note that we have used the ChatML format for dialog, in this demo we show how to leverage `apply_chat_template` for this purpose. ### Voice Chat Inference diff --git a/docs/source/en/model_doc/siglip.md b/docs/source/en/model_doc/siglip.md index 88e38cbb590edc..0c0977d10b580c 100644 --- a/docs/source/en/model_doc/siglip.md +++ b/docs/source/en/model_doc/siglip.md @@ -102,7 +102,7 @@ If you want to do the pre- and postprocessing yourself, here's how to do that: A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with SigLIP. 
-- [Zero-shot image classification task guide](../tasks/zero_shot_image_classification_md) +- [Zero-shot image classification task guide](../tasks/zero_shot_image_classification) - Demo notebooks for SigLIP can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/SigLIP). 🌎 If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. diff --git a/docs/source/en/model_doc/textnet.md b/docs/source/en/model_doc/textnet.md new file mode 100644 index 00000000000000..d6b431e648f21b --- /dev/null +++ b/docs/source/en/model_doc/textnet.md @@ -0,0 +1,55 @@ + + +# TextNet + +## Overview + +The TextNet model was proposed in [FAST: Faster Arbitrarily-Shaped Text Detector with Minimalist Kernel Representation](https://arxiv.org/abs/2111.02394) by Zhe Chen, Jiahao Wang, Wenhai Wang, Guo Chen, Enze Xie, Ping Luo, Tong Lu. TextNet is a vision backbone useful for text detection tasks. It is the result of neural architecture search (NAS) on backbones with reward function as text detection task (to provide powerful features for text detection). + + + + TextNet backbone as part of FAST. Taken from the original paper. + +This model was contributed by [Raghavan](https://huggingface.co/Raghavan), [jadechoghari](https://huggingface.co/jadechoghari) and [nielsr](https://huggingface.co/nielsr). + +## Usage tips + +TextNet is mainly used as a backbone network for the architecture search of text detection. Each stage of the backbone network is comprised of a stride-2 convolution and searchable blocks. +Specifically, we present a layer-level candidate set, defined as {conv3×3, conv1×3, conv3×1, identity}. As the 1×3 and 3×1 convolutions have asymmetric kernels and oriented structure priors, they may help to capture the features of extreme aspect-ratio and rotated text lines. + +TextNet is the backbone for Fast, but can also be used as an efficient text/image classification, we add a `TextNetForImageClassification` as is it would allow people to train an image classifier on top of the pre-trained textnet weights + +## TextNetConfig + +[[autodoc]] TextNetConfig + +## TextNetImageProcessor + +[[autodoc]] TextNetImageProcessor + - preprocess + +## TextNetModel + +[[autodoc]] TextNetModel + - forward + +## TextNetForImageClassification + +[[autodoc]] TextNetForImageClassification + - forward + diff --git a/docs/source/en/model_doc/vitpose.md b/docs/source/en/model_doc/vitpose.md new file mode 100644 index 00000000000000..361f8e30c75d2b --- /dev/null +++ b/docs/source/en/model_doc/vitpose.md @@ -0,0 +1,254 @@ + + +# VitPose + +## Overview + +The VitPose model was proposed in [ViTPose: Simple Vision Transformer Baselines for Human Pose Estimation](https://arxiv.org/abs/2204.12484) by Yufei Xu, Jing Zhang, Qiming Zhang, Dacheng Tao. VitPose employs a standard, non-hierarchical [Vision Transformer](https://arxiv.org/pdf/2010.11929v2) as backbone for the task of keypoint estimation. A simple decoder head is added on top to predict the heatmaps from a given image. Despite its simplicity, the model gets state-of-the-art results on the challenging MS COCO Keypoint Detection benchmark. + +The abstract from the paper is the following: + +*Although no specific domain knowledge is considered in the design, plain vision transformers have shown excellent performance in visual recognition tasks. 
However, little effort has been made to reveal the potential of such simple structures for pose estimation tasks. In this paper, we show the surprisingly good capabilities of plain vision transformers for pose estimation from various aspects, namely simplicity in model structure, scalability in model size, flexibility in training paradigm, and transferability of knowledge between models, through a simple baseline model called ViTPose. Specifically, ViTPose employs plain and non-hierarchical vision transformers as backbones to extract features for a given person instance and a lightweight decoder for pose estimation. It can be scaled up from 100M to 1B parameters by taking the advantages of the scalable model capacity and high parallelism of transformers, setting a new Pareto front between throughput and performance. Besides, ViTPose is very flexible regarding the attention type, input resolution, pre-training and finetuning strategy, as well as dealing with multiple pose tasks. We also empirically demonstrate that the knowledge of large ViTPose models can be easily transferred to small ones via a simple knowledge token. Experimental results show that our basic ViTPose model outperforms representative methods on the challenging MS COCO Keypoint Detection benchmark, while the largest model sets a new state-of-the-art.* + +![vitpose-architecture](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/vitpose-architecture.png) + +This model was contributed by [nielsr](https://huggingface.co/nielsr) and [sangbumchoi](https://github.com/SangbumChoi). +The original code can be found [here](https://github.com/ViTAE-Transformer/ViTPose). + +## Usage Tips + +ViTPose is a so-called top-down keypoint detection model. This means that one first uses an object detector, like [RT-DETR](rt_detr.md), to detect people (or other instances) in an image. Next, ViTPose takes the cropped images as input and predicts the keypoints. + +```py +import torch +import requests +import numpy as np + +from PIL import Image + +from transformers import ( + AutoProcessor, + RTDetrForObjectDetection, + VitPoseForPoseEstimation, +) + +device = "cuda" if torch.cuda.is_available() else "cpu" + +url = "http://images.cocodataset.org/val2017/000000000139.jpg" +image = Image.open(requests.get(url, stream=True).raw) + +# ------------------------------------------------------------------------ +# Stage 1. 
Detect humans on the image +# ------------------------------------------------------------------------ + +# You can choose detector by your choice +person_image_processor = AutoProcessor.from_pretrained("PekingU/rtdetr_r50vd_coco_o365") +person_model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd_coco_o365", device_map=device) + +inputs = person_image_processor(images=image, return_tensors="pt").to(device) + +with torch.no_grad(): + outputs = person_model(**inputs) + +results = person_image_processor.post_process_object_detection( + outputs, target_sizes=torch.tensor([(image.height, image.width)]), threshold=0.3 +) +result = results[0] # take first image results + +# Human label refers 0 index in COCO dataset +person_boxes = result["boxes"][result["labels"] == 0] +person_boxes = person_boxes.cpu().numpy() + +# Convert boxes from VOC (x1, y1, x2, y2) to COCO (x1, y1, w, h) format +person_boxes[:, 2] = person_boxes[:, 2] - person_boxes[:, 0] +person_boxes[:, 3] = person_boxes[:, 3] - person_boxes[:, 1] + +# ------------------------------------------------------------------------ +# Stage 2. Detect keypoints for each person found +# ------------------------------------------------------------------------ + +image_processor = AutoProcessor.from_pretrained("usyd-community/vitpose-base-simple") +model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-base-simple", device_map=device) + +inputs = image_processor(image, boxes=[person_boxes], return_tensors="pt").to(device) + +with torch.no_grad(): + outputs = model(**inputs) + +pose_results = image_processor.post_process_pose_estimation(outputs, boxes=[person_boxes]) +image_pose_result = pose_results[0] # results for first image +``` + + +### Visualization for supervision user +```py +import supervision as sv + +xy = torch.stack([pose_result['keypoints'] for pose_result in image_pose_result]).cpu().numpy() +scores = torch.stack([pose_result['scores'] for pose_result in image_pose_result]).cpu().numpy() + +key_points = sv.KeyPoints( + xy=xy, confidence=scores +) + +edge_annotator = sv.EdgeAnnotator( + color=sv.Color.GREEN, + thickness=1 +) +vertex_annotator = sv.VertexAnnotator( + color=sv.Color.RED, + radius=2 +) +annotated_frame = edge_annotator.annotate( + scene=image.copy(), + key_points=key_points +) +annotated_frame = vertex_annotator.annotate( + scene=annotated_frame, + key_points=key_points +) +``` + +### Visualization for advanced user +```py +import math +import cv2 + +def draw_points(image, keypoints, scores, pose_keypoint_color, keypoint_score_threshold, radius, show_keypoint_weight): + if pose_keypoint_color is not None: + assert len(pose_keypoint_color) == len(keypoints) + for kid, (kpt, kpt_score) in enumerate(zip(keypoints, scores)): + x_coord, y_coord = int(kpt[0]), int(kpt[1]) + if kpt_score > keypoint_score_threshold: + color = tuple(int(c) for c in pose_keypoint_color[kid]) + if show_keypoint_weight: + cv2.circle(image, (int(x_coord), int(y_coord)), radius, color, -1) + transparency = max(0, min(1, kpt_score)) + cv2.addWeighted(image, transparency, image, 1 - transparency, 0, dst=image) + else: + cv2.circle(image, (int(x_coord), int(y_coord)), radius, color, -1) + +def draw_links(image, keypoints, scores, keypoint_edges, link_colors, keypoint_score_threshold, thickness, show_keypoint_weight, stick_width = 2): + height, width, _ = image.shape + if keypoint_edges is not None and link_colors is not None: + assert len(link_colors) == len(keypoint_edges) + for sk_id, sk in enumerate(keypoint_edges): 
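+            # `sk` holds the two keypoint indices of this edge; the link is drawn only when both
+            # endpoints lie inside the image and both scores clear `keypoint_score_threshold`.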
+ x1, y1, score1 = (int(keypoints[sk[0], 0]), int(keypoints[sk[0], 1]), scores[sk[0]]) + x2, y2, score2 = (int(keypoints[sk[1], 0]), int(keypoints[sk[1], 1]), scores[sk[1]]) + if ( + x1 > 0 + and x1 < width + and y1 > 0 + and y1 < height + and x2 > 0 + and x2 < width + and y2 > 0 + and y2 < height + and score1 > keypoint_score_threshold + and score2 > keypoint_score_threshold + ): + color = tuple(int(c) for c in link_colors[sk_id]) + if show_keypoint_weight: + X = (x1, x2) + Y = (y1, y2) + mean_x = np.mean(X) + mean_y = np.mean(Y) + length = ((Y[0] - Y[1]) ** 2 + (X[0] - X[1]) ** 2) ** 0.5 + angle = math.degrees(math.atan2(Y[0] - Y[1], X[0] - X[1])) + polygon = cv2.ellipse2Poly( + (int(mean_x), int(mean_y)), (int(length / 2), int(stick_width)), int(angle), 0, 360, 1 + ) + cv2.fillConvexPoly(image, polygon, color) + transparency = max(0, min(1, 0.5 * (keypoints[sk[0], 2] + keypoints[sk[1], 2]))) + cv2.addWeighted(image, transparency, image, 1 - transparency, 0, dst=image) + else: + cv2.line(image, (x1, y1), (x2, y2), color, thickness=thickness) + + +# Note: keypoint_edges and color palette are dataset-specific +keypoint_edges = model.config.edges + +palette = np.array( + [ + [255, 128, 0], + [255, 153, 51], + [255, 178, 102], + [230, 230, 0], + [255, 153, 255], + [153, 204, 255], + [255, 102, 255], + [255, 51, 255], + [102, 178, 255], + [51, 153, 255], + [255, 153, 153], + [255, 102, 102], + [255, 51, 51], + [153, 255, 153], + [102, 255, 102], + [51, 255, 51], + [0, 255, 0], + [0, 0, 255], + [255, 0, 0], + [255, 255, 255], + ] +) + +link_colors = palette[[0, 0, 0, 0, 7, 7, 7, 9, 9, 9, 9, 9, 16, 16, 16, 16, 16, 16, 16]] +keypoint_colors = palette[[16, 16, 16, 16, 16, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0, 0, 0]] + +numpy_image = np.array(image) + +for pose_result in image_pose_result: + scores = np.array(pose_result["scores"]) + keypoints = np.array(pose_result["keypoints"]) + + # draw each point on image + draw_points(numpy_image, keypoints, scores, keypoint_colors, keypoint_score_threshold=0.3, radius=4, show_keypoint_weight=False) + + # draw links + draw_links(numpy_image, keypoints, scores, keypoint_edges, link_colors, keypoint_score_threshold=0.3, thickness=1, show_keypoint_weight=False) + +pose_image = Image.fromarray(numpy_image) +pose_image +``` +drawing + +### MoE backbone + +To enable MoE (Mixture of Experts) function in the backbone, user has to give appropriate configuration such as `num_experts` and input value `dataset_index` to the backbone model. However, it is not used in default parameters. Below is the code snippet for usage of MoE function. 
+ +```py +>>> from transformers import VitPoseBackboneConfig, VitPoseBackbone +>>> import torch + +>>> config = VitPoseBackboneConfig(num_experts=3, out_indices=[-1]) +>>> model = VitPoseBackbone(config) + +>>> pixel_values = torch.randn(3, 3, 256, 192) +>>> dataset_index = torch.tensor([1, 2, 3]) +>>> outputs = model(pixel_values, dataset_index) +``` + +## VitPoseImageProcessor + +[[autodoc]] VitPoseImageProcessor + - preprocess + +## VitPoseConfig + +[[autodoc]] VitPoseConfig + +## VitPoseForPoseEstimation + +[[autodoc]] VitPoseForPoseEstimation + - forward \ No newline at end of file diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index 364141c8e406b2..d9bdf6f6e4841d 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -47,7 +47,9 @@ FlashAttention-2 is currently supported for the following architectures: * [Cohere2](https://huggingface.co/docs/transformers/model_doc/cohere2#transformers.Cohere2Model) * [GLM](https://huggingface.co/docs/transformers/model_doc/glm#transformers.GLMModel) * [Dbrx](https://huggingface.co/docs/transformers/model_doc/dbrx#transformers.DbrxModel) +* [DiffLlama](https://huggingface.co/docs/transformers/model_doc/diffllama#transformers.DiffLlamaModel) * [DistilBert](https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel) +* [Emu3](https://huggingface.co/docs/transformers/model_doc/emu3) * [Gemma](https://huggingface.co/docs/transformers/model_doc/gemma#transformers.GemmaModel) * [Gemma2](https://huggingface.co/docs/transformers/model_doc/gemma2#transformers.Gemma2Model) * [GPT2](https://huggingface.co/docs/transformers/model_doc/gpt2) @@ -67,6 +69,7 @@ FlashAttention-2 is currently supported for the following architectures: * [Llava-NeXT](https://huggingface.co/docs/transformers/model_doc/llava_next) * [Llava-NeXT-Video](https://huggingface.co/docs/transformers/model_doc/llava_next_video) * [LLaVA-Onevision](https://huggingface.co/docs/transformers/model_doc/llava_onevision) +* [Moonshine](https://huggingface.co/docs/transformers/model_doc/moonshine#transformers.MoonshineModel) * [Mimi](https://huggingface.co/docs/transformers/model_doc/mimi) * [VipLlava](https://huggingface.co/docs/transformers/model_doc/vipllava) * [VideoLlava](https://huggingface.co/docs/transformers/model_doc/video_llava) @@ -106,6 +109,7 @@ FlashAttention-2 is currently supported for the following architectures: * [SigLIP](https://huggingface.co/docs/transformers/model_doc/siglip) * [UniSpeech](https://huggingface.co/docs/transformers/v4.39.3/en/model_doc/unispeech#transformers.UniSpeechModel) * [unispeech_sat](https://huggingface.co/docs/transformers/v4.39.3/en/model_doc/unispeech-sat#transformers.UniSpeechSatModel) +* [helium](https://huggingface.co/docs/transformers/main/en/model_doc/heliumtransformers.HeliumModel) You can request to add FlashAttention-2 support for another model by opening a GitHub Issue or Pull Request. 
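
As a quick reference while reading the list above, this is roughly how FlashAttention-2 is enabled at load time; the checkpoint is only an example, the `flash-attn` package is assumed to be installed, and half-precision weights are required.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-3.1-8B"  # example checkpoint; any architecture from the list above works

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,               # FlashAttention-2 needs fp16 or bf16 weights
    attn_implementation="flash_attention_2",  # opt in to the FlashAttention-2 kernels
    device_map="auto",
)

inputs = tokenizer("FlashAttention-2 speeds up inference because", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=20)[0]))
```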
@@ -237,11 +241,13 @@ For now, Transformers supports SDPA inference and training for the following arc * [data2vec_vision](https://huggingface.co/docs/transformers/main/en/model_doc/data2vec#transformers.Data2VecVisionModel) * [Dbrx](https://huggingface.co/docs/transformers/model_doc/dbrx#transformers.DbrxModel) * [DeiT](https://huggingface.co/docs/transformers/model_doc/deit#transformers.DeiTModel) +* [DiffLlama](https://huggingface.co/docs/transformers/model_doc/diffllama#transformers.DiffLlamaModel) * [Dinov2](https://huggingface.co/docs/transformers/en/model_doc/dinov2) * [Dinov2_with_registers](https://huggingface.co/docs/transformers/en/model_doc/dinov2) * [DistilBert](https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel) * [Dpr](https://huggingface.co/docs/transformers/model_doc/dpr#transformers.DprReader) * [EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder_decoder#transformers.EncoderDecoderModel) +* [Emu3](https://huggingface.co/docs/transformers/model_doc/emu3) * [Falcon](https://huggingface.co/docs/transformers/model_doc/falcon#transformers.FalconModel) * [Gemma](https://huggingface.co/docs/transformers/model_doc/gemma#transformers.GemmaModel) * [Gemma2](https://huggingface.co/docs/transformers/model_doc/gemma2#transformers.Gemma2Model) @@ -263,6 +269,7 @@ For now, Transformers supports SDPA inference and training for the following arc * [Llava-NeXT-Video](https://huggingface.co/docs/transformers/model_doc/llava_next_video) * [LLaVA-Onevision](https://huggingface.co/docs/transformers/model_doc/llava_onevision) * [M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100#transformers.M2M100Model) +* [Moonshine](https://huggingface.co/docs/transformers/model_doc/moonshine#transformers.MoonshineModel) * [Mimi](https://huggingface.co/docs/transformers/model_doc/mimi) * [Mistral](https://huggingface.co/docs/transformers/model_doc/mistral#transformers.MistralModel) * [Mllama](https://huggingface.co/docs/transformers/model_doc/mllama#transformers.MllamaForConditionalGeneration) @@ -281,8 +288,8 @@ For now, Transformers supports SDPA inference and training for the following arc * [Phi3](https://huggingface.co/docs/transformers/model_doc/phi3#transformers.Phi3Model) * [PhiMoE](https://huggingface.co/docs/transformers/model_doc/phimoe#transformers.PhimoeModel) * [Idefics](https://huggingface.co/docs/transformers/model_doc/idefics#transformers.IdeficsModel) -* [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper#transformers.WhisperModel) * [mBart](https://huggingface.co/docs/transformers/model_doc/mbart#transformers.MBartModel) +* [Moonshine](https://huggingface.co/docs/transformers/model_doc/moonshine#transformers.MoonshineModel) * [Mistral](https://huggingface.co/docs/transformers/model_doc/mistral#transformers.MistralModel) * [Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral#transformers.MixtralModel) * [StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm#transformers.StableLmModel) @@ -318,6 +325,7 @@ For now, Transformers supports SDPA inference and training for the following arc * [XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.XLMRobertaModel) * [XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl#transformers.XLMRobertaXLModel) * [YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos#transformers.YolosModel) +* 
[helium](https://huggingface.co/docs/transformers/main/en/model_doc/helium#transformers.HeliumModel)


diff --git a/docs/source/en/quantization/gptq.md b/docs/source/en/quantization/gptq.md
index 5713ef4132a9a8..1534a977f3436f 100644
--- a/docs/source/en/quantization/gptq.md
+++ b/docs/source/en/quantization/gptq.md
@@ -22,15 +22,42 @@ Try GPTQ quantization with PEFT in this [notebook](https://colab.research.google
 
 
 
-The [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) library implements the GPTQ algorithm, a post-training quantization technique where each row of the weight matrix is quantized independently to find a version of the weights that minimizes the error. These weights are quantized to int4, but they're restored to fp16 on the fly during inference. This can save your memory-usage by 4x because the int4 weights are dequantized in a fused kernel rather than a GPU's global memory, and you can also expect a speedup in inference because using a lower bitwidth takes less time to communicate.
+Both the [GPTQModel](https://github.com/ModelCloud/GPTQModel) and [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) libraries implement the GPTQ algorithm, a post-training quantization technique where each row of the weight matrix is quantized independently to find a version of the weights that minimizes error. These weights are quantized to int4, stored as int32 (int4 x 8) and dequantized (restored) to fp16 on the fly during inference. This can save memory by almost 4x because the int4 weights are dequantized in a fused kernel. You can also expect a substantial speedup in inference due to lower bandwidth requirements for lower bitwidth.
 
-Before you begin, make sure the following libraries are installed:
+[GPTQModel](https://github.com/ModelCloud/GPTQModel) started as a maintained fork of AutoGPTQ but has since differentiated itself with the following major differences:
+
+* Model support: GPTQModel continues to support all of the latest LLM models.
+* Multimodal support: GPTQModel supports accurate quantization of Qwen 2-VL and Ovis 1.6-VL image-to-text models.
+* Platform support: Linux, macOS (Apple Silicon), and Windows 11.
+* Hardware support: NVIDIA CUDA, AMD ROCm, Apple Silicon M1/MPS/CPU, Intel/AMD CPU, and Intel Datacenter Max/Arc GPUs.
+* Asymmetric support: Asymmetric quantization can potentially introduce lower quantization errors compared to symmetric quantization. However, it is not backward compatible with AutoGPTQ, and not all kernels, such as Marlin, support asymmetric quantization.
+* IPEX kernel for Intel/AMD accelerated CPU and Intel GPU (Datacenter Max/Arc GPUs) support.
+* Updated Marlin kernel from Neural Magic optimized for A100 (Ampere).
+* Updated kernels with auto-padding for legacy model support and models with non-uniform in/out-features.
+* Faster quantization, lower memory usage, and more accurate default quantization via GPTQModel quantization APIs.
+* User- and developer-friendly APIs.
+
+
+[AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) will likely be deprecated in the future due to the lack of continued support for new models and features.
+
+Before you begin, make sure the following libraries are installed and updated to the latest release:
 
 ```bash
-pip install auto-gptq
 pip install --upgrade accelerate optimum transformers
 ```
 
+Then install either GPTQModel or AutoGPTQ.
+ +```bash +pip install gptqmodel --no-build-isolation +``` + +or + +```bash +pip install auto-gptq --no-build-isolation +``` + To quantize a model (currently only supported for text models), you need to create a [`GPTQConfig`] class and set the number of bits to quantize to, a dataset to calibrate the weights for quantization, and a tokenizer to prepare the dataset. ```py @@ -92,9 +119,22 @@ from transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="auto") ``` +## Marlin + +[Marlin](https://github.com/IST-DASLab/marlin) is a 4-bit only CUDA GPTQ kernel, highly optimized for the NVIDIA A100 GPU (Ampere) architecture. Loading, dequantization, and execution of post-dequantized weights are highly parallelized, offering a substantial inference improvement versus the original CUDA GPTQ kernel. Marlin is only available for quantized inference and does not support model quantization. + +Marlin inference can be activated with the `backend` parameter in [`GPTQConfig`]. + +```py + +from transformers import AutoModelForCausalLM, GPTQConfig + +model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="auto", quantization_config=GPTQConfig(bits=4, backend="marlin")) +``` + ## ExLlama -[ExLlama](https://github.com/turboderp/exllama) is a Python/C++/CUDA implementation of the [Llama](model_doc/llama) model that is designed for faster inference with 4-bit GPTQ weights (check out these [benchmarks](https://github.com/huggingface/optimum/tree/main/tests/benchmark#gptq-benchmark)). The ExLlama kernel is activated by default when you create a [`GPTQConfig`] object. To boost inference speed even further, use the [ExLlamaV2](https://github.com/turboderp/exllamav2) kernels by configuring the `exllama_config` parameter: +[ExLlama](https://github.com/turboderp/exllama) is a CUDA implementation of the [Llama](model_doc/llama) model that is designed for faster inference with 4-bit GPTQ weights (check out these [benchmarks](https://github.com/huggingface/optimum/tree/main/tests/benchmark#gptq-benchmark)). The ExLlama kernel is activated by default when you create a [`GPTQConfig`] object. To boost inference speed even further, use the [ExLlamaV2](https://github.com/turboderp/exllamav2) kernels by configuring the `exllama_config` parameter: ```py import torch @@ -110,11 +150,11 @@ Only 4-bit models are supported, and we recommend deactivating the ExLlama kerne -The ExLlama kernels are only supported when the entire model is on the GPU. If you're doing inference on a CPU with AutoGPTQ (version > 0.4.2), then you'll need to disable the ExLlama kernel. This overwrites the attributes related to the ExLlama kernels in the quantization config of the config.json file. +The ExLlama kernels are only supported when the entire model is on the GPU. If you're doing inference on a CPU with AutoGPTQ or GPTQModel, then you'll need to disable the ExLlama kernel. This overwrites the attributes related to the ExLlama kernels in the quantization config of the config.json file. 
```py import torch from transformers import AutoModelForCausalLM, GPTQConfig gptq_config = GPTQConfig(bits=4, use_exllama=False) model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="cpu", quantization_config=gptq_config) -``` \ No newline at end of file +``` diff --git a/docs/source/en/quantization/overview.md b/docs/source/en/quantization/overview.md index 48840fad646fd0..dfe680832b1952 100644 --- a/docs/source/en/quantization/overview.md +++ b/docs/source/en/quantization/overview.md @@ -45,32 +45,50 @@ In short, supporting a wide range of quantization methods allows you to pick the Use the table below to help you decide which quantization method to use. -| Quantization method | On the fly quantization | CPU | CUDA GPU | RoCm GPU (AMD) | Metal (Apple Silicon) | Intel GPU | torch.compile() support | Number of bits | Supports fine-tuning (through PEFT) | Serializable with 🤗 transformers | 🤗 transformers support | Link to library | -|-------------------------------------|-------------------------|-----|----------|----------------|-----------------------|-----------|-------------------------|----------------|-------------------------------------|--------------|------------------------|---------------------------------------------| -| [AQLM](./aqlm) | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1 / 2 | 🟢 | 🟢 | 🟢 | https://github.com/Vahe1994/AQLM | -| [AWQ](./awq) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | ? | 4 | 🟢 | 🟢 | 🟢 | https://github.com/casper-hansen/AutoAWQ | -| [bitsandbytes](./bitsandbytes) | 🟢 | 🟡 * | 🟢 | 🟡 * | 🔴 ** | 🟡 * | 🔴 (soon!) | 4 / 8 | 🟢 | 🟢 | 🟢 | https://github.com/bitsandbytes-foundation/bitsandbytes | -| [compressed-tensors](./compressed_tensors) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 1 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/neuralmagic/compressed-tensors | -| [EETQ](./eetq) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | ? 
| 8 | 🟢 | 🟢 | 🟢 | https://github.com/NetEase-FuXi/EETQ | -| GGUF / GGML (llama.cpp) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 1 - 8 | 🔴 | [See GGUF section](../gguf) | [See GGUF section](../gguf) | https://github.com/ggerganov/llama.cpp | -| [GPTQ](./gptq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 2 - 3 - 4 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ | -| [HIGGS](./higgs) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 2 - 4 | 🔴 | 🟢 | 🟢 | https://github.com/HanGuo97/flute | -| [HQQ](./hqq) | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1 - 8 | 🟢 | 🔴 | 🟢 | https://github.com/mobiusml/hqq/ | -| [optimum-quanto](./quanto) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🟢 | 2 / 4 / 8 | 🔴 | 🔴 | 🟢 | https://github.com/huggingface/optimum-quanto | -| [FBGEMM_FP8](./fbgemm_fp8.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | https://github.com/pytorch/FBGEMM | -| [torchao](./torchao.md) | 🟢 | | 🟢 | 🔴 | partial support (int4 weight only) | 🔴 | | 4 / 8 | | 🟢🔴 | 🟢 | https://github.com/pytorch/ao | -| [VPTQ](./vptq) | 🔴 | 🔴 | 🟢 | 🟡 | 🔴 | 🔴 | 🟢 | 1 - 8 | 🔴 | 🟢 | 🟢 | https://github.com/microsoft/VPTQ | +| Quantization Method | On the fly quantization | CPU | CUDA GPU | ROCm GPU | Metal (Apple Silicon) | Intel GPU | Torch compile() | Bits | PEFT Fine Tuning | Serializable with 🤗Transformers | 🤗Transformers Support | Link to library | +|-----------------------------------------------|----------------------|-----------------|----------|-----------|------------------------------------|-----------------|-----------------|---------------|------------------|-----------------------------|-------------------------|---------------------------------------------| +| [AQLM](./aqlm.md) | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1/2 | 🟢 | 🟢 | 🟢 | https://github.com/Vahe1994/AQLM | +| [AWQ](./awq.md) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | ? | 4 | 🟢 | 🟢 | 🟢 | https://github.com/casper-hansen/AutoAWQ | +| [bitsandbytes](./bitsandbytes.md) | 🟢 | 🟡 1 | 🟢 | 🟡 1 | 🔴 2 | 🟡 1 | 🔴 1 | 4/8 | 🟢 | 🟢 | 🟢 | https://github.com/bitsandbytes-foundation/bitsandbytes | +| [compressed-tensors](./compressed_tensors.md) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 1/8 | 🟢 | 🟢 | 🟢 | https://github.com/neuralmagic/compressed-tensors | +| [EETQ](./eetq.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | ? | 8 | 🟢 | 🟢 | 🟢 | https://github.com/NetEase-FuXi/EETQ | +| [GGUF / GGML (llama.cpp)](../gguf.md) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 1/8 | 🔴 | [See Notes](../gguf.md) | [See Notes](../gguf.md) | https://github.com/ggerganov/llama.cpp | +| [GPTQModel](./gptq.md) | 🔴 | 🟢 3 | 🟢 | 🟢 | 🟢 | 🟢 4 | 🔴 | 2/3/4/8 | 🟢 | 🟢 | 🟢 | https://github.com/ModelCloud/GPTQModel | +| [AutoGPTQ](./gptq.md) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 2/3/4/8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ | +| [HIGGS](./higgs.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 2/4 | 🔴 | 🟢 | 🟢 | https://github.com/HanGuo97/flute | +| [HQQ](./hqq.md) | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1/8 | 🟢 | 🔴 | 🟢 | https://github.com/mobiusml/hqq/ | +| [optimum-quanto](./quanto.md) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🟢 | 2/4/8 | 🔴 | 🔴 | 🟢 | https://github.com/huggingface/optimum-quanto | +| [FBGEMM_FP8](./fbgemm_fp8.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | https://github.com/pytorch/FBGEMM | +| [torchao](./torchao.md) | 🟢 | | 🟢 | 🔴 | 🟡 5 | 🔴 | | 4/8 | | 🟢🔴 | 🟢 | https://github.com/pytorch/ao | +| [VPTQ](./vptq.md) | 🔴 | 🔴 | 🟢 | 🟡 | 🔴 | 🔴 | 🟢 | 1/8 | 🔴 | 🟢 | 🟢 | https://github.com/microsoft/VPTQ | + +**1:** bitsandbytes is being refactored to support multiple backends beyond CUDA. Currently, ROCm (AMD GPU) and Intel CPU implementations are mature, with Intel XPU in progress and Apple Silicon support expected by Q4/Q1. 
For installation instructions and the latest backend updates, visit [this link](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend). Check out [these docs](https://huggingface.co/docs/bitsandbytes/main/en/non_cuda_backends) for more details and feedback links. -\* bitsandbytes is being refactored to support multiple backends beyond CUDA. Currently, ROCm (AMD GPU) and Intel CPU implementations are mature, with Intel XPU in progress and Apple Silicon support expected by Q4/Q1. For installation instructions and the latest backend updates, visit [this link](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend). + + + + +**2:** bitsandbytes is seeking contributors to help develop and lead the Apple Silicon backend. Interested? Contact them directly via their repo. Stipends may be available through sponsorships. + + + + -We value your feedback to help identify bugs before the full release! Check out [these docs](https://huggingface.co/docs/bitsandbytes/main/en/non_cuda_backends) for more details and feedback links. +**3:** GPTQModel[CPU] supports 4-bit via IPEX on Intel/AMD and full bit range via Torch on Intel/AMD/Apple Silicon. -\** bitsandbytes is seeking contributors to help develop and lead the Apple Silicon backend. Interested? Contact them directly via their repo. Stipends may be available through sponsorships. +**4:** GPTQModel[Intel GPU] via IPEX only supports 4-bit for Intel Datacenter Max/Arc GPUs. + + + + + +**5:** torchao only supports int4 weight on Metal (Apple Silicon). + + - \ No newline at end of file diff --git a/docs/source/en/quantization/torchao.md b/docs/source/en/quantization/torchao.md index 38f7c074c97d90..46fb0f8cbb9a88 100644 --- a/docs/source/en/quantization/torchao.md +++ b/docs/source/en/quantization/torchao.md @@ -16,7 +16,8 @@ rendered properly in your Markdown viewer. Before you begin, make sure the following libraries are installed with their latest version: ```bash -pip install --upgrade torch torchao +# Updating 🤗 Transformers to the latest version, as the example script below uses the new auto compilation +pip install --upgrade torch torchao transformers ``` By default, the weights are loaded in full precision (torch.float32) regardless of the actual data type the weights are stored in such as torch.float16. Set `torch_dtype="auto"` to load the weights in the data type defined in a model's `config.json` file to automatically load the most memory-optimal data type. @@ -35,12 +36,8 @@ tokenizer = AutoTokenizer.from_pretrained(model_name) input_text = "What are we having for dinner?" 
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda") -# compile the quantized model to get speedup -import torchao -torchao.quantization.utils.recommended_inductor_config_setter() -quantized_model = torch.compile(quantized_model, mode="max-autotune") - -output = quantized_model.generate(**input_ids, max_new_tokens=10) +# auto-compile the quantized model with `cache_implementation="static"` to get speedup +output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static") print(tokenizer.decode(output[0], skip_special_tokens=True)) # benchmark the performance @@ -59,11 +56,11 @@ def benchmark_fn(f, *args, **kwargs): return f"{(t0.blocked_autorange().mean):.3f}" MAX_NEW_TOKENS = 1000 -print("int4wo-128 model:", benchmark_fn(quantized_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS)) +print("int4wo-128 model:", benchmark_fn(quantized_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS, cache_implementation="static")) bf16_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda", torch_dtype=torch.bfloat16) -bf16_model = torch.compile(bf16_model, mode="max-autotune") -print("bf16 model:", benchmark_fn(bf16_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS)) +output = bf16_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static") # auto-compile +print("bf16 model:", benchmark_fn(bf16_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS, cache_implementation="static")) ``` diff --git a/docs/source/en/tasks/video_text_to_text.md b/docs/source/en/tasks/video_text_to_text.md index 3929f7994bdafb..4c10907e4571bf 100644 --- a/docs/source/en/tasks/video_text_to_text.md +++ b/docs/source/en/tasks/video_text_to_text.md @@ -144,4 +144,4 @@ print(processor.decode(output[0][2:], skip_special_tokens=True)[len(user_prompt) And voila! -To learn more about chat templates and token streaming for video-text-to-text models, refer to the [image-text-to-text](../image_text_to_text) task guide because these models work similarly. \ No newline at end of file +To learn more about chat templates and token streaming for video-text-to-text models, refer to the [image-text-to-text](../tasks/image_text_to_text) task guide because these models work similarly. 
\ No newline at end of file diff --git a/docs/source/ja/model_doc/decision_transformer.md b/docs/source/ja/model_doc/decision_transformer.md index 9c7f27bbeeec2d..fe37feb5a35d54 100644 --- a/docs/source/ja/model_doc/decision_transformer.md +++ b/docs/source/ja/model_doc/decision_transformer.md @@ -23,31 +23,28 @@ Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laski 論文の要約は次のとおりです。 -*強化学習(RL)をシーケンスモデリング問題として抽象化するフレームワークを紹介します。 +_強化学習(RL)をシーケンスモデリング問題として抽象化するフレームワークを紹介します。 これにより、Transformer アーキテクチャのシンプルさとスケーラビリティ、および関連する進歩を活用できるようになります。 - GPT-x や BERT などの言語モデリングで。特に、Decision Transformer というアーキテクチャを紹介します。 - RL の問題を条件付きシーケンス モデリングとして投げかけます。値関数に適合する以前の RL アプローチとは異なり、 - ポリシー勾配を計算すると、Decision Transformer は因果的にマスクされたアルゴリズムを利用して最適なアクションを出力するだけです。 - 変成器。望ましいリターン (報酬)、過去の状態、アクションに基づいて自己回帰モデルを条件付けすることにより、 - Decision Transformer モデルは、望ましいリターンを達成する将来のアクションを生成できます。そのシンプルさにも関わらず、 - Decision Transformer は、最先端のモデルフリーのオフライン RL ベースラインのパフォーマンスと同等、またはそれを超えています。 - Atari、OpenAI Gym、Key-to-Door タスク* +GPT-x や BERT などの言語モデリングで。特に、Decision Transformer というアーキテクチャを紹介します。 +RL の問題を条件付きシーケンス モデリングとして投げかけます。値関数に適合する以前の RL アプローチとは異なり、 +ポリシー勾配を計算すると、Decision Transformer は因果的にマスクされたアルゴリズムを利用して最適なアクションを出力するだけです。 +変成器。望ましいリターン (報酬)、過去の状態、アクションに基づいて自己回帰モデルを条件付けすることにより、 +Decision Transformer モデルは、望ましいリターンを達成する将来のアクションを生成できます。そのシンプルさにも関わらず、 +Decision Transformer は、最先端のモデルフリーのオフライン RL ベースラインのパフォーマンスと同等、またはそれを超えています。 +Atari、OpenAI Gym、Key-to-Door タスク_ このバージョンのモデルは、状態がベクトルであるタスク用です。 -このモデルは、[edbeeching](https://huggingface.co/edbeeching) によって提供されました。元のコードは [ここ](https://github.com/kzl/decion-transformer) にあります。 +このモデルは、[edbeeching](https://huggingface.co/edbeeching) によって提供されました。元のコードは [ここ](https://github.com/kzl/decision-transformer) にあります。 ## DecisionTransformerConfig [[autodoc]] DecisionTransformerConfig - ## DecisionTransformerGPT2Model -[[autodoc]] DecisionTransformerGPT2Model - - forward +[[autodoc]] DecisionTransformerGPT2Model - forward ## DecisionTransformerModel -[[autodoc]] DecisionTransformerModel - - forward +[[autodoc]] DecisionTransformerModel - forward diff --git a/docs/source/ko/_toctree.yml b/docs/source/ko/_toctree.yml index 54740610ee1148..dfa50f9f13a1f1 100644 --- a/docs/source/ko/_toctree.yml +++ b/docs/source/ko/_toctree.yml @@ -145,38 +145,14 @@ title: (번역중) Getting started - local: quantization/bitsandbytes title: bitsandbytes - - local: in_translation - title: (번역중) GPTQ + - local: quantization/gptq + title: GPTQ - local: quantization/awq title: AWQ - local: in_translation title: (번역중) AQLM - local: in_translation title: (번역중) VPTQ - - local: in_translation - title: (번역중) Quanto - - local: in_translation - title: (번역중) EETQ - - local: in_translation - title: (번역중) HQQ - - local: in_translation - title: (번역중) Optimum - - local: in_translation - title: (번역중) Contribute new quantization method - title: (번역중) 경량화 메소드 -- sections: - - local: in_translation - title: (번역중) Getting started - - local: in_translation - title: (번역중) bitsandbytes - - local: quantization/gptq - title: GPTQ - - local: in_translation - title: (번역중) AWQ - - local: in_translation - title: (번역중) AQLM - - local: in_translation - title: (번역중) VPTQ - local: quantization/quanto title: Quanto - local: quantization/eetq @@ -692,8 +668,8 @@ sections: - local: in_translation title: (번역중) ALIGN - - local: in_translation - title: (번역중) AltCLIP + - local: model_doc/altclip + title: AltCLIP - local: model_doc/blip-2 title: BLIP-2 - local: model_doc/blip diff --git a/docs/source/ko/model_doc/altclip.md 
b/docs/source/ko/model_doc/altclip.md new file mode 100644 index 00000000000000..1236bcc9aaa7da --- /dev/null +++ b/docs/source/ko/model_doc/altclip.md @@ -0,0 +1,78 @@ +# AltCLIP + +## 개요[[overview]] + +AltCLIP 모델은 Zhongzhi Chen, Guang Liu, Bo-Wen Zhang, Fulong Ye, Qinghong Yang, Ledell Wu의 [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679v2) 논문에서 제안되었습니다. AltCLIP(CLIP의 언어 인코더를 변경하여 언어 기능 확장)은 다양한 이미지-텍스트 및 텍스트-텍스트 쌍으로 훈련된 신경망입니다. CLIP의 텍스트 인코더를 사전 훈련된 다국어 텍스트 인코더 XLM-R로 교체하여, 거의 모든 작업에서 CLIP과 유사한 성능을 얻을 수 있었으며, 원래 CLIP의 다국어 이해와 같은 기능도 확장되었습니다. + +논문의 초록은 다음과 같습니다: + +*본 연구에서는 강력한 이중 언어 멀티모달 표현 모델을 훈련하는 개념적으로 간단하고 효과적인 방법을 제시합니다. OpenAI에서 출시한 사전 훈련된 멀티모달 표현 모델 CLIP에서 시작하여, 그 텍스트 인코더를 사전 훈련된 다국어 텍스트 인코더 XLM-R로 교체하고, 교사 학습과 대조 학습으로 구성된 2단계 훈련 스키마를 통해 언어와 이미지 표현을 정렬했습니다. 우리는 광범위한 작업 평가를 통해 우리의 방법을 검증했습니다. ImageNet-CN, Flicker30k-CN, COCO-CN을 포함한 여러 작업에서 새로운 최고 성능을 달성했으며, 거의 모든 작업에서 CLIP과 유사한 성능을 얻었습니다. 이는 CLIP의 텍스트 인코더를 단순히 변경하여 다국어 이해와 같은 확장 기능을 얻을 수 있음을 시사합니다.* + +이 모델은 [jongjyh](https://huggingface.co/jongjyh)에 의해 기여되었습니다. + +## 사용 팁과 예제[[usage-tips-and-example]] + +AltCLIP의 사용법은 CLIP과 매우 유사하며, 차이점은 텍스트 인코더에 있습니다. 일반적인 어텐션 대신 양방향 어텐션을 사용하며, XLM-R의 [CLS] 토큰을 사용하여 텍스트 임베딩을 나타냅니다. + +AltCLIP은 멀티모달 비전 및 언어 모델입니다. 이미지와 텍스트 간의 유사성 계산 및 제로샷 이미지 분류에 사용할 수 있습니다. AltCLIP은 ViT와 같은 트랜스포머를 사용하여 시각적 특징을 얻고, 양방향 언어 모델을 사용하여 텍스트 특징을 얻습니다. 이후 텍스트와 시각적 특징 모두 동일한 차원의 잠재 공간으로 투사됩니다. 투사된 이미지와 텍스트 특징 간의 내적을 유사도 점수로 사용합니다. + +이미지를 트랜스포머 인코더에 입력하기 위해, 각 이미지를 일정한 크기의 겹치지 않는 패치 시퀀스로 분할한 뒤, 이를 선형 임베딩합니다. 전체 이미지를 나타내기 위해 [CLS] 토큰이 추가됩니다. 저자들은 절대 위치 임베딩도 추가하여 결과 벡터 시퀀스를 표준 트랜스포머 인코더에 입력합니다. [`CLIPImageProcessor`]는 모델을 위해 이미지를 크기 조정하고 정규화하는 데 사용할 수 있습니다. + +[`AltCLIPProcessor`]는 [`CLIPImageProcessor`]와 [`XLMRobertaTokenizer`]를 하나의 인스턴스로 묶어 텍스트를 인코딩하고 이미지를 준비합니다. 다음 예제는 [`AltCLIPProcessor`]와 [`AltCLIPModel`]을 사용하여 이미지와 텍스트 간의 유사성 점수를 얻는 방법을 보여줍니다. +```python +>>> from PIL import Image +>>> import requests + +>>> from transformers import AltCLIPModel, AltCLIPProcessor + +>>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP") +>>> processor = AltCLIPProcessor.from_pretrained("BAAI/AltCLIP") + +>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" +>>> image = Image.open(requests.get(url, stream=True).raw) + +>>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True) + +>>> outputs = model(**inputs) +>>> logits_per_image = outputs.logits_per_image # 이미지-텍스트 유사도 점수 +>>> probs = logits_per_image.softmax(dim=1) # 라벨 마다 확률을 얻기 위해 softmax 적용 +``` + + +이 모델은 `CLIPModel`을 기반으로 하므로, 원래 CLIP처럼 사용할 수 있습니다. + + + +## AltCLIPConfig + +[[autodoc]] AltCLIPConfig + - from_text_vision_configs + +## AltCLIPTextConfig + +[[autodoc]] AltCLIPTextConfig + +## AltCLIPVisionConfig + +[[autodoc]] AltCLIPVisionConfig + +## AltCLIPProcessor + +[[autodoc]] AltCLIPProcessor + +## AltCLIPModel + +[[autodoc]] AltCLIPModel + - forward + - get_text_features + - get_image_features + +## AltCLIPTextModel + +[[autodoc]] AltCLIPTextModel + - forward + +## AltCLIPVisionModel + +[[autodoc]] AltCLIPVisionModel + - forward diff --git a/examples/flax/question-answering/run_qa.py b/examples/flax/question-answering/run_qa.py index ee155e377e4137..692a43f9d23cdd 100644 --- a/examples/flax/question-answering/run_qa.py +++ b/examples/flax/question-answering/run_qa.py @@ -61,7 +61,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") Array = Any Dataset = datasets.arrow_dataset.Dataset diff --git a/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py b/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py index 095af99efffccc..ae106c7264317b 100644 --- a/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py +++ b/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py @@ -60,7 +60,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risk. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/flax/speech-recognition/requirements.txt") diff --git a/examples/flax/text-classification/run_flax_glue.py b/examples/flax/text-classification/run_flax_glue.py index ddbde78f703ce2..0c4edda3bd4afb 100755 --- a/examples/flax/text-classification/run_flax_glue.py +++ b/examples/flax/text-classification/run_flax_glue.py @@ -56,7 +56,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") Array = Any Dataset = datasets.arrow_dataset.Dataset diff --git a/examples/flax/token-classification/run_flax_ner.py b/examples/flax/token-classification/run_flax_ner.py index 5f1988c36de17c..bc83fb53e2120e 100644 --- a/examples/flax/token-classification/run_flax_ner.py +++ b/examples/flax/token-classification/run_flax_ner.py @@ -57,7 +57,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/modular-transformers/configuration_my_new_model.py b/examples/modular-transformers/configuration_my_new_model.py index 7042c586cbb636..59637e02d3f11c 100644 --- a/examples/modular-transformers/configuration_my_new_model.py +++ b/examples/modular-transformers/configuration_my_new_model.py @@ -43,7 +43,7 @@ class MyNewModelConfig(PretrainedConfig): The non-linear activation function (function or string) in the decoder. max_position_embeddings (`int`, *optional*, defaults to 2048): The maximum sequence length that this model might ever be used with. MyNewModel 1 supports up to 2048 tokens, - MyNewModel 2 up to 4096, CodeMyNewModel up to 16384. + MyNewModel 2 up to 4096, CodeLlama up to 16384. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. rms_norm_eps (`float`, *optional*, defaults to 1e-06): @@ -110,7 +110,7 @@ class MyNewModelConfig(PretrainedConfig): mlp_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers. head_dim (`int`, *optional*): - The attention head dimension. If None, it will default to hidden_size // num_heads + The attention head dimension. 
If None, it will default to hidden_size // num_attention_heads ```python >>> from transformers import MyNewModelModel, MyNewModelConfig diff --git a/examples/modular-transformers/image_processing_new_imgproc_model.py b/examples/modular-transformers/image_processing_new_imgproc_model.py index a64eb17861a1c2..f3ab1772ec59e9 100644 --- a/examples/modular-transformers/image_processing_new_imgproc_model.py +++ b/examples/modular-transformers/image_processing_new_imgproc_model.py @@ -247,7 +247,7 @@ def preprocess( # All transformations expect numpy arrays. images = [to_numpy_array(image) for image in images] - if is_scaled_image(images[0]) and do_rescale: + if do_rescale and is_scaled_image(images[0]): logger.warning_once( "It looks like you are trying to rescale already rescaled images. If the input" " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." diff --git a/examples/modular-transformers/modeling_dummy.py b/examples/modular-transformers/modeling_dummy.py index 3e0aa6e9b2ad02..382b87bd38471e 100644 --- a/examples/modular-transformers/modeling_dummy.py +++ b/examples/modular-transformers/modeling_dummy.py @@ -597,7 +597,7 @@ def _update_causal_mask( output_attentions: bool, ): if self.config._attn_implementation == "flash_attention_2": - if attention_mask is not None and 0.0 in attention_mask: + if attention_mask is not None and (attention_mask == 0.0).any(): return attention_mask return None diff --git a/examples/modular-transformers/modeling_multimodal1.py b/examples/modular-transformers/modeling_multimodal1.py index c4f90a5cbadab3..df23a83b341144 100644 --- a/examples/modular-transformers/modeling_multimodal1.py +++ b/examples/modular-transformers/modeling_multimodal1.py @@ -597,7 +597,7 @@ def _update_causal_mask( output_attentions: bool, ): if self.config._attn_implementation == "flash_attention_2": - if attention_mask is not None and 0.0 in attention_mask: + if attention_mask is not None and (attention_mask == 0.0).any(): return attention_mask return None diff --git a/examples/modular-transformers/modeling_my_new_model2.py b/examples/modular-transformers/modeling_my_new_model2.py index b8d5b5eb910095..9288b1a2930578 100644 --- a/examples/modular-transformers/modeling_my_new_model2.py +++ b/examples/modular-transformers/modeling_my_new_model2.py @@ -602,7 +602,7 @@ def _update_causal_mask( output_attentions: bool, ): if self.config._attn_implementation == "flash_attention_2": - if attention_mask is not None and 0.0 in attention_mask: + if attention_mask is not None and (attention_mask == 0.0).any(): return attention_mask return None diff --git a/examples/modular-transformers/modeling_new_task_model.py b/examples/modular-transformers/modeling_new_task_model.py index 477d084b1d9309..da0b354fe76efa 100644 --- a/examples/modular-transformers/modeling_new_task_model.py +++ b/examples/modular-transformers/modeling_new_task_model.py @@ -249,9 +249,6 @@ def set_decoder(self, decoder): def get_decoder(self): return self.language_model.get_decoder() - def tie_weights(self): - return self.language_model.tie_weights() - def _update_causal_mask( self, attention_mask, diff --git a/examples/modular-transformers/modeling_super.py b/examples/modular-transformers/modeling_super.py index 42d8108ee72a68..1f5aa55c46909e 100644 --- a/examples/modular-transformers/modeling_super.py +++ b/examples/modular-transformers/modeling_super.py @@ -519,7 +519,7 @@ def _update_causal_mask( output_attentions: bool, ): if self.config._attn_implementation == 
"flash_attention_2": - if attention_mask is not None and 0.0 in attention_mask: + if attention_mask is not None and (attention_mask == 0.0).any(): return attention_mask return None diff --git a/examples/pytorch/audio-classification/run_audio_classification.py b/examples/pytorch/audio-classification/run_audio_classification.py index ef308316569b79..d9cb0187e4d82f 100644 --- a/examples/pytorch/audio-classification/run_audio_classification.py +++ b/examples/pytorch/audio-classification/run_audio_classification.py @@ -45,7 +45,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt") diff --git a/examples/pytorch/contrastive-image-text/run_clip.py b/examples/pytorch/contrastive-image-text/run_clip.py index d42fb52d5c13c0..f8170bb416dd8c 100644 --- a/examples/pytorch/contrastive-image-text/run_clip.py +++ b/examples/pytorch/contrastive-image-text/run_clip.py @@ -54,7 +54,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt") diff --git a/examples/pytorch/image-classification/run_image_classification.py b/examples/pytorch/image-classification/run_image_classification.py index 111d8adce8b41a..32d85b7d98dab1 100755 --- a/examples/pytorch/image-classification/run_image_classification.py +++ b/examples/pytorch/image-classification/run_image_classification.py @@ -57,7 +57,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py index 6cbcac0a7e688a..0dc3e11f08abc7 100644 --- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py +++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/image-pretraining/run_mae.py b/examples/pytorch/image-pretraining/run_mae.py index f23e55191709a5..6308e250f5c341 100644 --- a/examples/pytorch/image-pretraining/run_mae.py +++ b/examples/pytorch/image-pretraining/run_mae.py @@ -43,7 +43,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/image-pretraining/run_mim.py b/examples/pytorch/image-pretraining/run_mim.py index 9d052076b7b162..3721dc267c2273 100644 --- a/examples/pytorch/image-pretraining/run_mim.py +++ b/examples/pytorch/image-pretraining/run_mim.py @@ -48,7 +48,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/image-pretraining/run_mim_no_trainer.py b/examples/pytorch/image-pretraining/run_mim_no_trainer.py index 100a1365c2e9ea..0d8496bb3fed15 100644 --- a/examples/pytorch/image-pretraining/run_mim_no_trainer.py +++ b/examples/pytorch/image-pretraining/run_mim_no_trainer.py @@ -53,7 +53,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/instance-segmentation/run_instance_segmentation.py b/examples/pytorch/instance-segmentation/run_instance_segmentation.py index 806330fb72d107..306e8085f6763f 100644 --- a/examples/pytorch/instance-segmentation/run_instance_segmentation.py +++ b/examples/pytorch/instance-segmentation/run_instance_segmentation.py @@ -46,7 +46,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt") diff --git a/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py b/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py index d888b7853dd475..e9cd01610bb7b3 100644 --- a/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py +++ b/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py @@ -52,7 +52,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py index 10bfee8f25f775..54098f5a7dd617 100755 --- a/examples/pytorch/language-modeling/run_clm.py +++ b/examples/pytorch/language-modeling/run_clm.py @@ -55,7 +55,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index 078b0add065ceb..e35e4e7d907ecb 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/language-modeling/run_fim.py b/examples/pytorch/language-modeling/run_fim.py index cac845f3a055df..e94690eaa7fcd3 100644 --- a/examples/pytorch/language-modeling/run_fim.py +++ b/examples/pytorch/language-modeling/run_fim.py @@ -58,7 +58,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_fim_no_trainer.py b/examples/pytorch/language-modeling/run_fim_no_trainer.py index 0a0e10511fa236..64e340a62a7639 100644 --- a/examples/pytorch/language-modeling/run_fim_no_trainer.py +++ b/examples/pytorch/language-modeling/run_fim_no_trainer.py @@ -60,7 +60,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py index 8cb30099491a3a..fde2980d3ab5b8 100755 --- a/examples/pytorch/language-modeling/run_mlm.py +++ b/examples/pytorch/language-modeling/run_mlm.py @@ -54,7 +54,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index 0bff38707d567a..40265efcfdae29 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") logger = get_logger(__name__) require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_plm.py b/examples/pytorch/language-modeling/run_plm.py index 20763558a5f626..6ef17ebb9b6af8 100755 --- a/examples/pytorch/language-modeling/run_plm.py +++ b/examples/pytorch/language-modeling/run_plm.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/multiple-choice/run_swag.py b/examples/pytorch/multiple-choice/run_swag.py index f188e4e476a208..2d927f41978824 100755 --- a/examples/pytorch/multiple-choice/run_swag.py +++ b/examples/pytorch/multiple-choice/run_swag.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py index 2d4e8bdbb92c06..9ec972d091c74b 100755 --- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py +++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") logger = get_logger(__name__) # You should update this to your particular problem to have better documentation of `model_type` diff --git a/examples/pytorch/object-detection/run_object_detection.py b/examples/pytorch/object-detection/run_object_detection.py index 07fcb36acb1583..8d722f4d5d5ddb 100644 --- a/examples/pytorch/object-detection/run_object_detection.py +++ b/examples/pytorch/object-detection/run_object_detection.py @@ -48,7 +48,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/object-detection/requirements.txt") diff --git a/examples/pytorch/object-detection/run_object_detection_no_trainer.py b/examples/pytorch/object-detection/run_object_detection_no_trainer.py index 33ad0499301e18..dbfcb3fd97fae8 100644 --- a/examples/pytorch/object-detection/run_object_detection_no_trainer.py +++ b/examples/pytorch/object-detection/run_object_detection_no_trainer.py @@ -51,7 +51,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") logging.basicConfig(level=logging.INFO) logger = get_logger(__name__) diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py index 6de464f4367072..81fcc7b8b70b96 100755 --- a/examples/pytorch/question-answering/run_qa.py +++ b/examples/pytorch/question-answering/run_qa.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_beam_search.py b/examples/pytorch/question-answering/run_qa_beam_search.py index c3e12ac9edef16..4b199a9e8990ba 100755 --- a/examples/pytorch/question-answering/run_qa_beam_search.py +++ b/examples/pytorch/question-answering/run_qa_beam_search.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py index 8e791564b007bf..312d8b389dd610 100644 --- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py index 6ccce481b548a9..da448c37f2c4d6 100755 --- a/examples/pytorch/question-answering/run_qa_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_seq2seq_qa.py b/examples/pytorch/question-answering/run_seq2seq_qa.py index c1874b3fe18e12..cd6204c467c2a2 100644 --- a/examples/pytorch/question-answering/run_seq2seq_qa.py +++ b/examples/pytorch/question-answering/run_seq2seq_qa.py @@ -46,7 +46,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py index 6f77f82564172d..0551b5f61f10c5 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py @@ -51,7 +51,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/semantic-segmentation/requirements.txt") diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py index 16e64eb92343f1..8a0d9de748f9ec 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py index 9eb3498c8c174e..70fc035fabb845 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py index 682d3b16d216a3..a6e3d9f7f33fc0 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py @@ -53,7 +53,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py index e6a643e4213910..6d3950802c83ef 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py index a2d09f20004704..93036a7e03ac06 100755 --- a/examples/pytorch/summarization/run_summarization.py +++ b/examples/pytorch/summarization/run_summarization.py @@ -52,7 +52,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py index ab5ab7adb19c44..c2201840cacc18 100644 --- a/examples/pytorch/summarization/run_summarization_no_trainer.py +++ b/examples/pytorch/summarization/run_summarization_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/pytorch/text-classification/run_classification.py b/examples/pytorch/text-classification/run_classification.py index dae845b119b172..fef77f4108a39d 100755 --- a/examples/pytorch/text-classification/run_classification.py +++ b/examples/pytorch/text-classification/run_classification.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py index 2a99bc42e1195a..9f1c93f3df2edb 100755 --- a/examples/pytorch/text-classification/run_glue.py +++ b/examples/pytorch/text-classification/run_glue.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/text-classification/run_glue_no_trainer.py b/examples/pytorch/text-classification/run_glue_no_trainer.py index 8da7e86d87551f..61bd746f0782e4 100644 --- a/examples/pytorch/text-classification/run_glue_no_trainer.py +++ b/examples/pytorch/text-classification/run_glue_no_trainer.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/text-classification/run_xnli.py b/examples/pytorch/text-classification/run_xnli.py index c76f83ce4def6f..0c7d2c44b8788e 100755 --- a/examples/pytorch/text-classification/run_xnli.py +++ b/examples/pytorch/text-classification/run_xnli.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py index 056db7167280ed..81340dc2eef8ad 100755 --- a/examples/pytorch/token-classification/run_ner.py +++ b/examples/pytorch/token-classification/run_ner.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py index 56e3a1e646db34..5b8c4c80ef8940 100755 --- a/examples/pytorch/token-classification/run_ner_no_trainer.py +++ b/examples/pytorch/token-classification/run_ner_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py index dadfcb80941e27..1d8ff8d05e530a 100755 --- a/examples/pytorch/translation/run_translation.py +++ b/examples/pytorch/translation/run_translation.py @@ -52,7 +52,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py index df4c1e9557a982..50213214304692 100644 --- a/examples/pytorch/translation/run_translation_no_trainer.py +++ b/examples/pytorch/translation/run_translation_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/examples/research_projects/decision_transformer/requirements.txt b/examples/research_projects/decision_transformer/requirements.txt index 33bb1d658595e5..b8ee442d9bfeae 100644 --- a/examples/research_projects/decision_transformer/requirements.txt +++ b/examples/research_projects/decision_transformer/requirements.txt @@ -92,7 +92,7 @@ itsdangerous==2.1.1 jax==0.3.4 jaxlib==0.3.2 jedi==0.18.1 -Jinja2==3.1.4 +Jinja2==3.1.5 jinja2-time==0.2.0 jmespath==0.10.0 joblib==1.2.0 diff --git a/examples/tensorflow/contrastive-image-text/run_clip.py b/examples/tensorflow/contrastive-image-text/run_clip.py index 01c31de8730b12..bc656ba6ff1c2c 100644 --- a/examples/tensorflow/contrastive-image-text/run_clip.py +++ b/examples/tensorflow/contrastive-image-text/run_clip.py @@ -51,7 +51,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version( "datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/contrastive-image-text/requirements.txt" diff --git a/examples/tensorflow/image-classification/run_image_classification.py b/examples/tensorflow/image-classification/run_image_classification.py index 296c70549bda1b..6e4be8dcb0565a 100644 --- a/examples/tensorflow/image-classification/run_image_classification.py +++ b/examples/tensorflow/image-classification/run_image_classification.py @@ -55,7 +55,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") diff --git a/examples/tensorflow/multiple-choice/run_swag.py b/examples/tensorflow/multiple-choice/run_swag.py index b35d761d8a6a6a..02b8e565240166 100644 --- a/examples/tensorflow/multiple-choice/run_swag.py +++ b/examples/tensorflow/multiple-choice/run_swag.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. 
Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/tensorflow/question-answering/run_qa.py b/examples/tensorflow/question-answering/run_qa.py index a78a5d89e19f6c..46f0470d1c5686 100755 --- a/examples/tensorflow/question-answering/run_qa.py +++ b/examples/tensorflow/question-answering/run_qa.py @@ -62,7 +62,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/tensorflow/summarization/run_summarization.py b/examples/tensorflow/summarization/run_summarization.py index 92a10990d160ed..3fd823ec8c08b6 100644 --- a/examples/tensorflow/summarization/run_summarization.py +++ b/examples/tensorflow/summarization/run_summarization.py @@ -53,7 +53,7 @@ # region Checking dependencies # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/tensorflow/text-classification/run_glue.py b/examples/tensorflow/text-classification/run_glue.py index a8f2de825cc2e4..0fbe3790e0589c 100644 --- a/examples/tensorflow/text-classification/run_glue.py +++ b/examples/tensorflow/text-classification/run_glue.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") task_to_keys = { "cola": ("sentence", None), diff --git a/examples/tensorflow/translation/run_translation.py b/examples/tensorflow/translation/run_translation.py index 1afb72cf10984e..148ee55f26c3dd 100644 --- a/examples/tensorflow/translation/run_translation.py +++ b/examples/tensorflow/translation/run_translation.py @@ -56,7 +56,7 @@ # region Dependencies and constants # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/read_video.py b/read_video.py new file mode 100644 index 00000000000000..25e201a6e48d83 --- /dev/null +++ b/read_video.py @@ -0,0 +1,77 @@ +import numpy as np +import cv2 +import requests +from yt_dlp import YoutubeDL +from contextlib import redirect_stdout +from pathlib import Path +import io +import imageio.v3 as iio + + +url = "https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/720/Big_Buck_Bunny_720_10s_10MB.mp4" +vid = cv2.VideoCapture(url) +# ret, frame = vid.read() + +while(True): + # Capture frame-by-frame + ret, frame = vid.read() + #print cap.isOpened(), ret + if frame is not None: + pass + # print(frame.shape) + else: + break + +print(vid.isOpened(), frame is not None) + +buffer = io.BytesIO(requests.get(url).content) +video = buffer.getvalue() +frames = iio.imread(video, index=None) +print(frames.shape) + + + + + +youtube_id = "https://www.youtube.com/watch?v=BaW_jenozKc" + +ctx = { + "outtmpl": "-", + 'logtostderr': True +} + +buffer = io.BytesIO() +with redirect_stdout(buffer), YoutubeDL(ctx) as foo: + foo.download([youtube_id]) +# Path(f"vi.mp4").write_bytes(buffer.getvalue()) + +video = buffer.getvalue() +print(type(video)) +frames = iio.imread(video, index=None) +print(frames.shape) + + +import decord +file_obj = io.BytesIO(video) +container = decord.VideoReader(file_obj) +print(container[2].shape) + +# print(np.frombuffer(video, dtype=np.uint8).shape) +# img_array = np.asarray(bytearray(video), dtype=np.uint8) +# im = cv2.imdecode(img_array, cv2.IMREAD_UNCHANGED) + + + +import av + +file_obj = io.BytesIO(video) +container = av.open(file_obj) +container.seek(0) +frames = [] +for i, frame in enumerate(container.decode(video=0)): + if i > 10: + break + if i >= 0: + frames.append(frame) +out = np.stack([x.to_ndarray(format="rgb24") for x in frames]) +print(out.shape) diff --git a/run.py b/run.py new file mode 100644 index 00000000000000..b79ba1ecf3fb9f --- /dev/null +++ b/run.py @@ -0,0 +1,107 @@ +import av +import torch +import decord +from decord import VideoReader, cpu + +import numpy as np +from PIL import Image +from huggingface_hub import hf_hub_download +from transformers import LlavaNextVideoProcessor, LlavaNextVideoForConditionalGeneration, SiglipImageProcessor + +model_id = "/raid/raushan/llava-next-video-qwen-7b" + +model = LlavaNextVideoForConditionalGeneration.from_pretrained( + model_id, + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True, +).to(0) + +processor = LlavaNextVideoProcessor.from_pretrained(model_id, torch_dtype=torch.bfloat16) +img_proc = SiglipImageProcessor.from_pretrained("google/siglip-so400m-patch14-384") + +image = Image.open("/raid/raushan/image.png") + + +def load_video(video_path, max_frames_num,fps=1,force_sample=False): + + vr = VideoReader(video_path) + total_frame_num = len(vr) + video_time = total_frame_num / vr.get_avg_fps() + fps = round(vr.get_avg_fps()/fps) + frame_idx = [i for i in range(0, len(vr), fps)] + frame_time = [i/fps for i in frame_idx] + if len(frame_idx) > max_frames_num or force_sample: + sample_fps = max_frames_num + uniform_sampled_frames = np.linspace(0, total_frame_num - 1, sample_fps, dtype=int) + frame_idx = uniform_sampled_frames.tolist() + frame_time = [i/vr.get_avg_fps() for i in frame_idx] + frame_time = ",".join([f"{i:.2f}s" for i in frame_time]) + spare_frames = vr.get_batch(frame_idx).asnumpy() + 
+    print(spare_frames.shape)
+    return spare_frames, frame_time, video_time
+
+
+def read_video_pyav(container, indices):
+    '''
+    Decode the video with PyAV decoder.
+    Args:
+        container (`av.container.input.InputContainer`): PyAV container.
+        indices (`List[int]`): List of frame indices to decode.
+    Returns:
+        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
+    '''
+    frames = []
+    container.seek(0)
+    start_index = indices[0]
+    end_index = indices[-1]
+    for i, frame in enumerate(container.decode(video=0)):
+        if i > end_index:
+            break
+        if i >= start_index and i in indices:
+            frames.append(frame)
+    return np.stack([x.to_ndarray(format="rgb24") for x in frames])
+
+
+# Define a chat history and use `apply_chat_template` to get a correctly formatted prompt.
+# Each value in "content" has to be a list of dicts with types ("text", "image", "video").
+# <|im_start|>system
+# You are a helpful assistant.<|im_end|>
+# <|im_start|>user
+# Time frames are these moments and we have 64 frames
+# Please describe this video in detail.<|im_end|>
+# <|im_start|>assistant
+
+conversation = [
+    {
+
+        "role": "system",
+        "content": [
+            {"type": "text", "text": "You are a helpful assistant."},
+        ],
+    },
+    {
+
+        "role": "user",
+        "content": [
+            {"type": "text", "text": "The video lasts for 19.97 seconds, and 64 frames are uniformly sampled from it. These frames are located at 0.00s,0.30s,0.60s,0.93s,1.23s,1.57s,1.87s,2.20s,2.50s,2.83s,3.13s,3.47s,3.77s,4.10s,4.40s,4.73s,5.03s,5.37s,5.67s,6.00s,6.30s,6.63s,6.93s,7.27s,7.57s,7.90s,8.20s,8.53s,8.83s,9.17s,9.47s,9.80s,10.10s,10.43s,10.73s,11.07s,11.37s,11.70s,12.00s,12.33s,12.63s,12.97s,13.27s,13.60s,13.90s,14.23s,14.53s,14.87s,15.17s,15.50s,15.80s,16.13s,16.43s,16.77s,17.07s,17.40s,17.70s,18.03s,18.33s,18.67s,18.97s,19.30s,19.60s,19.93s.Please answer the following questions related to this video.\nPlease describe this video in detail."},
+            {"type": "video"},
+        ],
+    },
+]
+
+prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n