From 184a215c8e1c2f9a4cae394740b8d9c82d07decf Mon Sep 17 00:00:00 2001
From: Miquel Farre <miquel.farre@huggingface.co>
Date: Fri, 20 Dec 2024 13:28:30 +0000
Subject: [PATCH 1/8] bugfix processing empty images

---
 .../models/idefics3/processing_idefics3.py    | 75 ++++++++++---------
 1 file changed, 39 insertions(+), 36 deletions(-)

diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py
index 872f5206f20175..1b0e6e22a5e79f 100644
--- a/src/transformers/models/idefics3/processing_idefics3.py
+++ b/src/transformers/models/idefics3/processing_idefics3.py
@@ -283,46 +283,49 @@ def __call__(
             image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
             inputs.update(image_inputs)
 
-        if text is not None:
-            if n_images_in_images != n_images_in_text:
-                raise ValueError(
-                    f"The number of images in the text {n_images_in_text} and images  {n_images_in_images} should be the same."
-                )
-
-            image_rows = inputs.pop("rows", [[0] * len(text)])
-            image_cols = inputs.pop("cols", [[0] * len(text)])
-
-            fake_image_token = self.fake_image_token.content
-            image_token = self.image_token.content
-            global_img_token = self.global_image_tag
-
-            prompt_strings = []
-            for sample, sample_rows, sample_cols in zip(text, image_rows, image_cols):
-                # Replace the image token with fake tokens around the expanded image token sequence of length `image_seq_len`
-                image_prompt_strings = []
-                for n_rows, n_cols in zip(sample_rows, sample_cols):
-                    image_prompt_string = get_image_prompt_string(
-                        n_rows,
-                        n_cols,
-                        image_seq_len,
-                        image_token=image_token,
-                        fake_token_around_image=fake_image_token,
-                        global_img_token=global_img_token,
+            if text is not None:
+                if n_images_in_images != n_images_in_text:
+                    raise ValueError(
+                        f"The number of images in the text {n_images_in_text} and images {n_images_in_images} should be the same."
                     )
-                    image_prompt_strings.append(image_prompt_string)
 
-                split_sample = sample.split(image_token)
-                if len(split_sample) == 0:
-                    raise ValueError("The image token should be present in the text.")
+                image_rows = inputs.pop("rows", [[0] * len(text)])
+                image_cols = inputs.pop("cols", [[0] * len(text)])
+
+                fake_image_token = self.fake_image_token.content
+                image_token = self.image_token.content
+                global_img_token = self.global_image_tag
+
+                prompt_strings = []
+                for sample, sample_rows, sample_cols in zip(text, image_rows, image_cols):
+                    # Replace the image token with fake tokens around the expanded image token sequence of length `image_seq_len`
+                    image_prompt_strings = []
+                    for n_rows, n_cols in zip(sample_rows, sample_cols):
+                        image_prompt_string = get_image_prompt_string(
+                            n_rows,
+                            n_cols,
+                            image_seq_len,
+                            image_token=image_token,
+                            fake_token_around_image=fake_image_token,
+                            global_img_token=global_img_token,
+                        )
+                        image_prompt_strings.append(image_prompt_string)
+
+                    split_sample = sample.split(image_token)
+                    if len(split_sample) == 0:
+                        raise ValueError("The image token should be present in the text.")
+
+                    # Place in the image prompt strings where the image tokens are
+                    sample = split_sample[0]
+                    for i, image_prompt_string in enumerate(image_prompt_strings):
+                        sample += image_prompt_string + split_sample[i + 1]
+                    prompt_strings.append(sample)
 
-                # Place in the image prompt strings where the image tokens are
-                sample = split_sample[0]
-                for i, image_prompt_string in enumerate(image_prompt_strings):
-                    sample += image_prompt_string + split_sample[i + 1]
-                prompt_strings.append(sample)
+                text_inputs = self.tokenizer(text=prompt_strings, **output_kwargs["text_kwargs"])
+        else:
+            text_inputs = self.tokenizer(text=text, **output_kwargs["text_kwargs"])
 
-            text_inputs = self.tokenizer(text=prompt_strings, **output_kwargs["text_kwargs"])
-            inputs.update(text_inputs)
+        inputs.update(text_inputs)
 
         return inputs
 

From f2c808cb7a9c067e87a6314a3910d0ca0e0a1d6b Mon Sep 17 00:00:00 2001
From: Miquel Farre <miquel.farre@huggingface.co>
Date: Fri, 20 Dec 2024 13:34:51 +0000
Subject: [PATCH 2/8] fix

---
 src/transformers/models/idefics3/processing_idefics3.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py
index 1b0e6e22a5e79f..de70ffd18a7c18 100644
--- a/src/transformers/models/idefics3/processing_idefics3.py
+++ b/src/transformers/models/idefics3/processing_idefics3.py
@@ -322,7 +322,7 @@ def __call__(
                     prompt_strings.append(sample)
 
                 text_inputs = self.tokenizer(text=prompt_strings, **output_kwargs["text_kwargs"])
-        else:
+        elif text is not None:
             text_inputs = self.tokenizer(text=text, **output_kwargs["text_kwargs"])
 
         inputs.update(text_inputs)

From f994edcbd10079378bfb1f100b60b68b76543045 Mon Sep 17 00:00:00 2001
From: Miquel Farre <miquel.farre@huggingface.co>
Date: Fri, 20 Dec 2024 13:40:21 +0000
Subject: [PATCH 3/8] fix

---
 src/transformers/models/idefics3/processing_idefics3.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py
index de70ffd18a7c18..a64d62cef012a9 100644
--- a/src/transformers/models/idefics3/processing_idefics3.py
+++ b/src/transformers/models/idefics3/processing_idefics3.py
@@ -322,10 +322,11 @@ def __call__(
                     prompt_strings.append(sample)
 
                 text_inputs = self.tokenizer(text=prompt_strings, **output_kwargs["text_kwargs"])
+                inputs.update(text_inputs)
+
         elif text is not None:
             text_inputs = self.tokenizer(text=text, **output_kwargs["text_kwargs"])
-
-        inputs.update(text_inputs)
+            inputs.update(text_inputs)
 
         return inputs
 

From f83689cb493b1872648e1506c3a2ee5552602195 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miquel=20Farr=C3=A9?= <miquel.farre@gmail.com>
Date: Fri, 20 Dec 2024 19:54:37 +0100
Subject: [PATCH 4/8] Update
 src/transformers/models/idefics3/processing_idefics3.py

Co-authored-by: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com>
---
 src/transformers/models/idefics3/processing_idefics3.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py
index a64d62cef012a9..7ca5829e2063d8 100644
--- a/src/transformers/models/idefics3/processing_idefics3.py
+++ b/src/transformers/models/idefics3/processing_idefics3.py
@@ -325,6 +325,10 @@ def __call__(
                 inputs.update(text_inputs)
 
         elif text is not None:
+            if any(n_images_in_text):
+                raise ValueError(
+                    f"Found {sum(n_images_in_text)} {self.image_token.content} tokens in the text but no images were passed."
+                )
             text_inputs = self.tokenizer(text=text, **output_kwargs["text_kwargs"])
             inputs.update(text_inputs)
 

From 3c1fc533c43c6e3c1beb1b167925ddc73ef83f3d Mon Sep 17 00:00:00 2001
From: Miquel Farre <miquel.farre@huggingface.co>
Date: Fri, 20 Dec 2024 19:12:49 +0000
Subject: [PATCH 5/8] adding tests

---
 .../idefics3/test_processor_idefics3.py       | 72 +++++++++++++++++++
 1 file changed, 72 insertions(+)

diff --git a/tests/models/idefics3/test_processor_idefics3.py b/tests/models/idefics3/test_processor_idefics3.py
index 52d2f1539a4867..d315ce612fda8f 100644
--- a/tests/models/idefics3/test_processor_idefics3.py
+++ b/tests/models/idefics3/test_processor_idefics3.py
@@ -505,3 +505,75 @@ def test_unstructured_kwargs(self):
 
         self.assertEqual(inputs["pixel_values"].shape[3], 32)
         self.assertEqual(len(inputs["input_ids"][0]), 120)
+
+    @require_torch
+    @require_vision
+    def test_text_only_inference(self):
+            """Test that the processor works correctly with text-only input."""
+            processor = self.get_processor()
+            
+            text = "This is a simple text without images."
+            inputs = processor(text=text)
+            
+            tokenized_sentence = processor.tokenizer(text, add_special_tokens=False)
+            expected_input_ids = [[self.bos_token_id] + tokenized_sentence["input_ids"]]
+            
+            self.assertEqual(inputs["input_ids"], expected_input_ids)
+            self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])])
+            self.assertTrue("pixel_values" not in inputs)
+            self.assertTrue("pixel_attention_mask" not in inputs)
+            
+            # Test batch of texts without image tokens
+            texts = ["First text.", "Second piece of text."]
+            batch_inputs = processor(text=texts, padding=True)
+            
+            tokenized_1 = processor.tokenizer(texts[0], add_special_tokens=False)
+            tokenized_2 = processor.tokenizer(texts[1], add_special_tokens=False)
+            
+            expected_1 = [self.bos_token_id] + tokenized_1["input_ids"]
+            expected_2 = [self.bos_token_id] + tokenized_2["input_ids"]
+            
+            # Pad the shorter sequence
+            pad_len = len(expected_2) - len(expected_1)
+            if pad_len > 0:
+                padded_expected_1 = [self.padding_token_id] * pad_len + expected_1
+                expected_attention_1 = [0] * pad_len + [1] * len(expected_1)
+                self.assertEqual(batch_inputs["input_ids"], [padded_expected_1, expected_2])
+                self.assertEqual(batch_inputs["attention_mask"], [expected_attention_1, [1] * len(expected_2)])
+            else:
+                pad_len = -pad_len
+                padded_expected_2 = [self.padding_token_id] * pad_len + expected_2
+                expected_attention_2 = [0] * pad_len + [1] * len(expected_2)
+                self.assertEqual(batch_inputs["input_ids"], [expected_1, padded_expected_2])
+                self.assertEqual(batch_inputs["attention_mask"], [[1] * len(expected_1), expected_attention_2])
+
+    @require_torch
+    @require_vision
+    def test_missing_images_error(self):
+        """Test that appropriate error is raised when images are referenced but not provided."""
+        processor = self.get_processor()
+        
+        # Test single text with image token but no image
+        text = "Let me show you this image: <image> What do you think?"
+        with self.assertRaises(ValueError) as context:
+            processor(text=text)
+        self.assertTrue("Number of images" in str(context.exception))
+        
+        # Test batch with image tokens but no images
+        texts = [
+            "First text with <image> token.",
+            "Second text <image> with token.",
+        ]
+        with self.assertRaises(ValueError) as context:
+            processor(text=texts)
+        self.assertTrue("Number of images" in str(context.exception))
+        
+        # Test with empty images list
+        with self.assertRaises(ValueError) as context:
+            processor(text=text, images=[])
+        self.assertTrue("Number of images" in str(context.exception))
+        
+        # Test with batch and empty images lists
+        with self.assertRaises(ValueError) as context:
+            processor(text=texts, images=[[], []])
+        self.assertTrue("Number of images" in str(context.exception))
\ No newline at end of file

From b83ffc0fe37fade253f2459aedaec4bfe3b42166 Mon Sep 17 00:00:00 2001
From: Miquel Farre <miquel.farre@huggingface.co>
Date: Fri, 20 Dec 2024 19:21:55 +0000
Subject: [PATCH 6/8] fix

---
 .../idefics3/test_processor_idefics3.py       | 30 +++++++++----------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/tests/models/idefics3/test_processor_idefics3.py b/tests/models/idefics3/test_processor_idefics3.py
index d315ce612fda8f..b22cd642b21b6c 100644
--- a/tests/models/idefics3/test_processor_idefics3.py
+++ b/tests/models/idefics3/test_processor_idefics3.py
@@ -511,28 +511,28 @@ def test_unstructured_kwargs(self):
     def test_text_only_inference(self):
             """Test that the processor works correctly with text-only input."""
             processor = self.get_processor()
-            
+
             text = "This is a simple text without images."
             inputs = processor(text=text)
-            
+
             tokenized_sentence = processor.tokenizer(text, add_special_tokens=False)
             expected_input_ids = [[self.bos_token_id] + tokenized_sentence["input_ids"]]
-            
+
             self.assertEqual(inputs["input_ids"], expected_input_ids)
             self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])])
             self.assertTrue("pixel_values" not in inputs)
             self.assertTrue("pixel_attention_mask" not in inputs)
-            
+
             # Test batch of texts without image tokens
             texts = ["First text.", "Second piece of text."]
             batch_inputs = processor(text=texts, padding=True)
-            
+
             tokenized_1 = processor.tokenizer(texts[0], add_special_tokens=False)
             tokenized_2 = processor.tokenizer(texts[1], add_special_tokens=False)
-            
+
             expected_1 = [self.bos_token_id] + tokenized_1["input_ids"]
             expected_2 = [self.bos_token_id] + tokenized_2["input_ids"]
-            
+
             # Pad the shorter sequence
             pad_len = len(expected_2) - len(expected_1)
             if pad_len > 0:
@@ -552,13 +552,13 @@ def test_text_only_inference(self):
     def test_missing_images_error(self):
         """Test that appropriate error is raised when images are referenced but not provided."""
         processor = self.get_processor()
-        
+
         # Test single text with image token but no image
         text = "Let me show you this image: <image> What do you think?"
         with self.assertRaises(ValueError) as context:
             processor(text=text)
-        self.assertTrue("Number of images" in str(context.exception))
-        
+        self.assertTrue("tokens in the text but no images were passed" in str(context.exception))
+
         # Test batch with image tokens but no images
         texts = [
             "First text with <image> token.",
@@ -566,14 +566,14 @@ def test_missing_images_error(self):
         ]
         with self.assertRaises(ValueError) as context:
             processor(text=texts)
-        self.assertTrue("Number of images" in str(context.exception))
-        
+        self.assertTrue("tokens in the text but no images were passed" in str(context.exception))
+
         # Test with empty images list
         with self.assertRaises(ValueError) as context:
             processor(text=text, images=[])
-        self.assertTrue("Number of images" in str(context.exception))
-        
+        self.assertTrue("tokens in the text but no images were passed" in str(context.exception))
+
         # Test with batch and empty images lists
         with self.assertRaises(ValueError) as context:
             processor(text=texts, images=[[], []])
-        self.assertTrue("Number of images" in str(context.exception))
\ No newline at end of file
+        self.assertTrue("tokens in the text but no images were passed" in str(context.exception))
\ No newline at end of file

From a82bc253acfa25c5d0d0686f0f3cfc4b4b7d759f Mon Sep 17 00:00:00 2001
From: Miquel Farre <miquel.farre@huggingface.co>
Date: Fri, 20 Dec 2024 19:24:27 +0000
Subject: [PATCH 7/8] fix

---
 tests/models/idefics3/test_processor_idefics3.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/idefics3/test_processor_idefics3.py b/tests/models/idefics3/test_processor_idefics3.py
index b22cd642b21b6c..f993b534fc252e 100644
--- a/tests/models/idefics3/test_processor_idefics3.py
+++ b/tests/models/idefics3/test_processor_idefics3.py
@@ -576,4 +576,4 @@ def test_missing_images_error(self):
         # Test with batch and empty images lists
         with self.assertRaises(ValueError) as context:
             processor(text=texts, images=[[], []])
-        self.assertTrue("tokens in the text but no images were passed" in str(context.exception))
\ No newline at end of file
+        self.assertTrue("tokens in the text but no images were passed" in str(context.exception))

From 2fd930a1fe9377c2a2200a46bca7bb86fc0af8b7 Mon Sep 17 00:00:00 2001
From: Miquel Farre <miquel.farre@huggingface.co>
Date: Fri, 20 Dec 2024 21:04:53 +0000
Subject: [PATCH 8/8] fix

---
 .../idefics3/test_processor_idefics3.py       | 81 +++++++++----------
 1 file changed, 40 insertions(+), 41 deletions(-)

diff --git a/tests/models/idefics3/test_processor_idefics3.py b/tests/models/idefics3/test_processor_idefics3.py
index f993b534fc252e..36c5d294844939 100644
--- a/tests/models/idefics3/test_processor_idefics3.py
+++ b/tests/models/idefics3/test_processor_idefics3.py
@@ -509,43 +509,43 @@ def test_unstructured_kwargs(self):
     @require_torch
     @require_vision
     def test_text_only_inference(self):
-            """Test that the processor works correctly with text-only input."""
-            processor = self.get_processor()
-
-            text = "This is a simple text without images."
-            inputs = processor(text=text)
-
-            tokenized_sentence = processor.tokenizer(text, add_special_tokens=False)
-            expected_input_ids = [[self.bos_token_id] + tokenized_sentence["input_ids"]]
-
-            self.assertEqual(inputs["input_ids"], expected_input_ids)
-            self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])])
-            self.assertTrue("pixel_values" not in inputs)
-            self.assertTrue("pixel_attention_mask" not in inputs)
-
-            # Test batch of texts without image tokens
-            texts = ["First text.", "Second piece of text."]
-            batch_inputs = processor(text=texts, padding=True)
-
-            tokenized_1 = processor.tokenizer(texts[0], add_special_tokens=False)
-            tokenized_2 = processor.tokenizer(texts[1], add_special_tokens=False)
-
-            expected_1 = [self.bos_token_id] + tokenized_1["input_ids"]
-            expected_2 = [self.bos_token_id] + tokenized_2["input_ids"]
-
-            # Pad the shorter sequence
-            pad_len = len(expected_2) - len(expected_1)
-            if pad_len > 0:
-                padded_expected_1 = [self.padding_token_id] * pad_len + expected_1
-                expected_attention_1 = [0] * pad_len + [1] * len(expected_1)
-                self.assertEqual(batch_inputs["input_ids"], [padded_expected_1, expected_2])
-                self.assertEqual(batch_inputs["attention_mask"], [expected_attention_1, [1] * len(expected_2)])
-            else:
-                pad_len = -pad_len
-                padded_expected_2 = [self.padding_token_id] * pad_len + expected_2
-                expected_attention_2 = [0] * pad_len + [1] * len(expected_2)
-                self.assertEqual(batch_inputs["input_ids"], [expected_1, padded_expected_2])
-                self.assertEqual(batch_inputs["attention_mask"], [[1] * len(expected_1), expected_attention_2])
+        """Test that the processor works correctly with text-only input."""
+        processor = self.get_processor()
+
+        text = "This is a simple text without images."
+        inputs = processor(text=text)
+
+        tokenized_sentence = processor.tokenizer(text, add_special_tokens=False)
+        expected_input_ids = [[self.bos_token_id] + tokenized_sentence["input_ids"]]
+
+        self.assertEqual(inputs["input_ids"], expected_input_ids)
+        self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])])
+        self.assertTrue("pixel_values" not in inputs)
+        self.assertTrue("pixel_attention_mask" not in inputs)
+
+        # Test batch of texts without image tokens
+        texts = ["First text.", "Second piece of text."]
+        batch_inputs = processor(text=texts, padding=True)
+
+        tokenized_1 = processor.tokenizer(texts[0], add_special_tokens=False)
+        tokenized_2 = processor.tokenizer(texts[1], add_special_tokens=False)
+
+        expected_1 = [self.bos_token_id] + tokenized_1["input_ids"]
+        expected_2 = [self.bos_token_id] + tokenized_2["input_ids"]
+
+        # Pad the shorter sequence
+        pad_len = len(expected_2) - len(expected_1)
+        if pad_len > 0:
+            padded_expected_1 = [self.padding_token_id] * pad_len + expected_1
+            expected_attention_1 = [0] * pad_len + [1] * len(expected_1)
+            self.assertEqual(batch_inputs["input_ids"], [padded_expected_1, expected_2])
+            self.assertEqual(batch_inputs["attention_mask"], [expected_attention_1, [1] * len(expected_2)])
+        else:
+            pad_len = -pad_len
+            padded_expected_2 = [self.padding_token_id] * pad_len + expected_2
+            expected_attention_2 = [0] * pad_len + [1] * len(expected_2)
+            self.assertEqual(batch_inputs["input_ids"], [expected_1, padded_expected_2])
+            self.assertEqual(batch_inputs["attention_mask"], [[1] * len(expected_1), expected_attention_2])
 
     @require_torch
     @require_vision
@@ -568,12 +568,11 @@ def test_missing_images_error(self):
             processor(text=texts)
         self.assertTrue("tokens in the text but no images were passed" in str(context.exception))
 
-        # Test with empty images list
+        # Test with None as Images
         with self.assertRaises(ValueError) as context:
-            processor(text=text, images=[])
+            processor(text=text, images=None)
         self.assertTrue("tokens in the text but no images were passed" in str(context.exception))
 
-        # Test with batch and empty images lists
         with self.assertRaises(ValueError) as context:
-            processor(text=texts, images=[[], []])
+            processor(text=texts, images=None)
         self.assertTrue("tokens in the text but no images were passed" in str(context.exception))