diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index e917efbc73d7dc..2e502d02fdefab 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -747,7 +747,7 @@ def forward( class IdeficsGatedCrossAttentionLayer(nn.Module): - def __init__(self, config: IdeficsConfig): + def __init__(self, config: IdeficsConfig, layer_idx: int = None): super().__init__() self.hidden_size = config.hidden_size self.cross_attn = IdeficsAttention( @@ -757,6 +757,7 @@ def __init__(self, config: IdeficsConfig): dropout=config.dropout, config=config, qk_layer_norms=config.qk_layer_norms, + layer_idx=layer_idx, ) self.mlp = IdeficsMLP( hidden_size=self.hidden_size, @@ -1048,7 +1049,7 @@ def __init__(self, config: IdeficsConfig): self.cross_layer_interval = config.cross_layer_interval num_cross_layers = config.num_hidden_layers // self.cross_layer_interval self.gated_cross_attn_layers = nn.ModuleList( - [IdeficsGatedCrossAttentionLayer(config) for _ in range(num_cross_layers)] + [IdeficsGatedCrossAttentionLayer(config, layer_idx=i) for i in range(num_cross_layers)] ) self.gradient_checkpointing = False diff --git a/src/transformers/models/idefics/processing_idefics.py b/src/transformers/models/idefics/processing_idefics.py index 444a970e5a3606..3b0e92aac91e45 100644 --- a/src/transformers/models/idefics/processing_idefics.py +++ b/src/transformers/models/idefics/processing_idefics.py @@ -20,6 +20,7 @@ from urllib.parse import urlparse from ...feature_extraction_utils import BatchFeature +from ...image_utils import ImageInput from ...processing_utils import ( ImagesKwargs, ProcessingKwargs, @@ -203,7 +204,10 @@ class IdeficsProcessor(ProcessorMixin): An instance of [`IdeficsImageProcessor`]. The image processor is a required input. tokenizer (`LlamaTokenizerFast`): An instance of [`LlamaTokenizerFast`]. The tokenizer is a required input. 
- image_size (`int`, *optional*, defaults to 224): Image size (assuming a square image) + image_size (`int`, *optional*, defaults to 224): + Image size (assuming a square image) + add_end_of_utterance_token (`str`, *optional*): + The string representation of the token representing the end of an utterance. """ attributes = ["image_processor", "tokenizer"] @@ -240,7 +244,7 @@ def __init__(self, image_processor, tokenizer=None, image_size=224, add_end_of_u @deprecate_kwarg(old_name="prompts", version="5.0.0", new_name="text", raise_if_both_names=True) def __call__( self, - images=None, + images: Union[ImageInput, List[ImageInput], str, List[str], List[List[str]]] = None, text: Union[ TextInput, PreTokenizedInput, @@ -257,7 +261,7 @@ def __call__( the model was trained on and prepares the image pixel values for the model to process. Args: - images (`Union[PIL.Image, str, List[PIL.Image], List[str]]`): + images (`Union[ImageInput, List[ImageInput], str, List[str], List[List[str]]]`): either a single image or a batched list of images - can be passed in when text contains only text prompts, in order to use the image-text-to-text behavior. text (`Union[List[TextInput], [List[List[TextInput]]]]`):