diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index e917efbc73d7dc..2e502d02fdefab 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -747,7 +747,7 @@ def forward( class IdeficsGatedCrossAttentionLayer(nn.Module): - def __init__(self, config: IdeficsConfig): + def __init__(self, config: IdeficsConfig, layer_idx: int = None): super().__init__() self.hidden_size = config.hidden_size self.cross_attn = IdeficsAttention( @@ -757,6 +757,7 @@ def __init__(self, config: IdeficsConfig): dropout=config.dropout, config=config, qk_layer_norms=config.qk_layer_norms, + layer_idx=layer_idx, ) self.mlp = IdeficsMLP( hidden_size=self.hidden_size, @@ -1048,7 +1049,7 @@ def __init__(self, config: IdeficsConfig): self.cross_layer_interval = config.cross_layer_interval num_cross_layers = config.num_hidden_layers // self.cross_layer_interval self.gated_cross_attn_layers = nn.ModuleList( - [IdeficsGatedCrossAttentionLayer(config) for _ in range(num_cross_layers)] + [IdeficsGatedCrossAttentionLayer(config, layer_idx=i) for i in range(num_cross_layers)] ) self.gradient_checkpointing = False diff --git a/src/transformers/models/idefics/processing_idefics.py b/src/transformers/models/idefics/processing_idefics.py index 444a970e5a3606..3b0e92aac91e45 100644 --- a/src/transformers/models/idefics/processing_idefics.py +++ b/src/transformers/models/idefics/processing_idefics.py @@ -20,6 +20,7 @@ from urllib.parse import urlparse from ...feature_extraction_utils import BatchFeature +from ...image_utils import ImageInput from ...processing_utils import ( ImagesKwargs, ProcessingKwargs, @@ -203,7 +204,10 @@ class IdeficsProcessor(ProcessorMixin): An instance of [`IdeficsImageProcessor`]. The image processor is a required input. tokenizer (`LlamaTokenizerFast`): An instance of [`LlamaTokenizerFast`]. The tokenizer is a required input. 
- image_size (`int`, *optional*, defaults to 224): Image size (assuming a square image) + image_size (`int`, *optional*, defaults to 224): + Image size (assuming a square image) + add_end_of_utterance_token (`str`, *optional*): + The string representation of the token representing the end of an utterance. """ attributes = ["image_processor", "tokenizer"] @@ -240,7 +244,7 @@ def __init__(self, image_processor, tokenizer=None, image_size=224, add_end_of_u @deprecate_kwarg(old_name="prompts", version="5.0.0", new_name="text", raise_if_both_names=True) def __call__( self, - images=None, + images: Union[ImageInput, List[ImageInput], str, List[str], List[List[str]]] = None, text: Union[ TextInput, PreTokenizedInput, @@ -257,7 +261,7 @@ def __call__( the model was trained on and prepares the image pixel values for the model to process. Args: - images (`Union[PIL.Image, str, List[PIL.Image], List[str]]`): + images (`Union[ImageInput, List[ImageInput], str, List[str], List[List[str]]]`): either a single image or a batched list of images - can be passed in when text contains only text prompts, in order to use the image-text-to-text behavior. text (`Union[List[TextInput], [List[List[TextInput]]]]`):