From 184a215c8e1c2f9a4cae394740b8d9c82d07decf Mon Sep 17 00:00:00 2001 From: Miquel Farre Date: Fri, 20 Dec 2024 13:28:30 +0000 Subject: [PATCH 1/8] bugfix processing empty images --- .../models/idefics3/processing_idefics3.py | 75 ++++++++++--------- 1 file changed, 39 insertions(+), 36 deletions(-) diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py index 872f5206f20175..1b0e6e22a5e79f 100644 --- a/src/transformers/models/idefics3/processing_idefics3.py +++ b/src/transformers/models/idefics3/processing_idefics3.py @@ -283,46 +283,49 @@ def __call__( image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"]) inputs.update(image_inputs) - if text is not None: - if n_images_in_images != n_images_in_text: - raise ValueError( - f"The number of images in the text {n_images_in_text} and images {n_images_in_images} should be the same." - ) - - image_rows = inputs.pop("rows", [[0] * len(text)]) - image_cols = inputs.pop("cols", [[0] * len(text)]) - - fake_image_token = self.fake_image_token.content - image_token = self.image_token.content - global_img_token = self.global_image_tag - - prompt_strings = [] - for sample, sample_rows, sample_cols in zip(text, image_rows, image_cols): - # Replace the image token with fake tokens around the expanded image token sequence of length `image_seq_len` - image_prompt_strings = [] - for n_rows, n_cols in zip(sample_rows, sample_cols): - image_prompt_string = get_image_prompt_string( - n_rows, - n_cols, - image_seq_len, - image_token=image_token, - fake_token_around_image=fake_image_token, - global_img_token=global_img_token, + if text is not None: + if n_images_in_images != n_images_in_text: + raise ValueError( + f"The number of images in the text {n_images_in_text} and images {n_images_in_images} should be the same." ) - image_prompt_strings.append(image_prompt_string) - split_sample = sample.split(image_token) - if len(split_sample) == 0: - raise ValueError("The image token should be present in the text.") + image_rows = inputs.pop("rows", [[0] * len(text)]) + image_cols = inputs.pop("cols", [[0] * len(text)]) + + fake_image_token = self.fake_image_token.content + image_token = self.image_token.content + global_img_token = self.global_image_tag + + prompt_strings = [] + for sample, sample_rows, sample_cols in zip(text, image_rows, image_cols): + # Replace the image token with fake tokens around the expanded image token sequence of length `image_seq_len` + image_prompt_strings = [] + for n_rows, n_cols in zip(sample_rows, sample_cols): + image_prompt_string = get_image_prompt_string( + n_rows, + n_cols, + image_seq_len, + image_token=image_token, + fake_token_around_image=fake_image_token, + global_img_token=global_img_token, + ) + image_prompt_strings.append(image_prompt_string) + + split_sample = sample.split(image_token) + if len(split_sample) == 0: + raise ValueError("The image token should be present in the text.") + + # Place in the image prompt strings where the image tokens are + sample = split_sample[0] + for i, image_prompt_string in enumerate(image_prompt_strings): + sample += image_prompt_string + split_sample[i + 1] + prompt_strings.append(sample) - # Place in the image prompt strings where the image tokens are - sample = split_sample[0] - for i, image_prompt_string in enumerate(image_prompt_strings): - sample += image_prompt_string + split_sample[i + 1] - prompt_strings.append(sample) + text_inputs = self.tokenizer(text=prompt_strings, **output_kwargs["text_kwargs"]) + else: + text_inputs = self.tokenizer(text=text, **output_kwargs["text_kwargs"]) - text_inputs = self.tokenizer(text=prompt_strings, **output_kwargs["text_kwargs"]) - inputs.update(text_inputs) + inputs.update(text_inputs) return inputs From f2c808cb7a9c067e87a6314a3910d0ca0e0a1d6b Mon Sep 17 00:00:00 2001 From: Miquel Farre Date: Fri, 20 Dec 2024 13:34:51 +0000 Subject: [PATCH 2/8] fix --- src/transformers/models/idefics3/processing_idefics3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py index 1b0e6e22a5e79f..de70ffd18a7c18 100644 --- a/src/transformers/models/idefics3/processing_idefics3.py +++ b/src/transformers/models/idefics3/processing_idefics3.py @@ -322,7 +322,7 @@ def __call__( prompt_strings.append(sample) text_inputs = self.tokenizer(text=prompt_strings, **output_kwargs["text_kwargs"]) - else: + elif text is not None: text_inputs = self.tokenizer(text=text, **output_kwargs["text_kwargs"]) inputs.update(text_inputs) From f994edcbd10079378bfb1f100b60b68b76543045 Mon Sep 17 00:00:00 2001 From: Miquel Farre Date: Fri, 20 Dec 2024 13:40:21 +0000 Subject: [PATCH 3/8] fix --- src/transformers/models/idefics3/processing_idefics3.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py index de70ffd18a7c18..a64d62cef012a9 100644 --- a/src/transformers/models/idefics3/processing_idefics3.py +++ b/src/transformers/models/idefics3/processing_idefics3.py @@ -322,10 +322,11 @@ def __call__( prompt_strings.append(sample) text_inputs = self.tokenizer(text=prompt_strings, **output_kwargs["text_kwargs"]) + inputs.update(text_inputs) + elif text is not None: text_inputs = self.tokenizer(text=text, **output_kwargs["text_kwargs"]) - - inputs.update(text_inputs) + inputs.update(text_inputs) return inputs From f83689cb493b1872648e1506c3a2ee5552602195 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miquel=20Farr=C3=A9?= Date: Fri, 20 Dec 2024 19:54:37 +0100 Subject: [PATCH 4/8] Update src/transformers/models/idefics3/processing_idefics3.py Co-authored-by: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com> --- src/transformers/models/idefics3/processing_idefics3.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py index a64d62cef012a9..7ca5829e2063d8 100644 --- a/src/transformers/models/idefics3/processing_idefics3.py +++ b/src/transformers/models/idefics3/processing_idefics3.py @@ -325,6 +325,10 @@ def __call__( inputs.update(text_inputs) elif text is not None: + if any(n_images_in_text): + raise ValueError( + f"Found {sum(n_images_in_text)} {self.image_token.content} tokens in the text but no images were passed." + ) text_inputs = self.tokenizer(text=text, **output_kwargs["text_kwargs"]) inputs.update(text_inputs) From 3c1fc533c43c6e3c1beb1b167925ddc73ef83f3d Mon Sep 17 00:00:00 2001 From: Miquel Farre Date: Fri, 20 Dec 2024 19:12:49 +0000 Subject: [PATCH 5/8] adding tests --- .../idefics3/test_processor_idefics3.py | 72 +++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/tests/models/idefics3/test_processor_idefics3.py b/tests/models/idefics3/test_processor_idefics3.py index 52d2f1539a4867..d315ce612fda8f 100644 --- a/tests/models/idefics3/test_processor_idefics3.py +++ b/tests/models/idefics3/test_processor_idefics3.py @@ -505,3 +505,75 @@ def test_unstructured_kwargs(self): self.assertEqual(inputs["pixel_values"].shape[3], 32) self.assertEqual(len(inputs["input_ids"][0]), 120) + + @require_torch + @require_vision + def test_text_only_inference(self): + """Test that the processor works correctly with text-only input.""" + processor = self.get_processor() + + text = "This is a simple text without images." + inputs = processor(text=text) + + tokenized_sentence = processor.tokenizer(text, add_special_tokens=False) + expected_input_ids = [[self.bos_token_id] + tokenized_sentence["input_ids"]] + + self.assertEqual(inputs["input_ids"], expected_input_ids) + self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])]) + self.assertTrue("pixel_values" not in inputs) + self.assertTrue("pixel_attention_mask" not in inputs) + + # Test batch of texts without image tokens + texts = ["First text.", "Second piece of text."] + batch_inputs = processor(text=texts, padding=True) + + tokenized_1 = processor.tokenizer(texts[0], add_special_tokens=False) + tokenized_2 = processor.tokenizer(texts[1], add_special_tokens=False) + + expected_1 = [self.bos_token_id] + tokenized_1["input_ids"] + expected_2 = [self.bos_token_id] + tokenized_2["input_ids"] + + # Pad the shorter sequence + pad_len = len(expected_2) - len(expected_1) + if pad_len > 0: + padded_expected_1 = [self.padding_token_id] * pad_len + expected_1 + expected_attention_1 = [0] * pad_len + [1] * len(expected_1) + self.assertEqual(batch_inputs["input_ids"], [padded_expected_1, expected_2]) + self.assertEqual(batch_inputs["attention_mask"], [expected_attention_1, [1] * len(expected_2)]) + else: + pad_len = -pad_len + padded_expected_2 = [self.padding_token_id] * pad_len + expected_2 + expected_attention_2 = [0] * pad_len + [1] * len(expected_2) + self.assertEqual(batch_inputs["input_ids"], [expected_1, padded_expected_2]) + self.assertEqual(batch_inputs["attention_mask"], [[1] * len(expected_1), expected_attention_2]) + + @require_torch + @require_vision + def test_missing_images_error(self): + """Test that appropriate error is raised when images are referenced but not provided.""" + processor = self.get_processor() + + # Test single text with image token but no image + text = "Let me show you this image: What do you think?" + with self.assertRaises(ValueError) as context: + processor(text=text) + self.assertTrue("Number of images" in str(context.exception)) + + # Test batch with image tokens but no images + texts = [ + "First text with token.", + "Second text with token.", + ] + with self.assertRaises(ValueError) as context: + processor(text=texts) + self.assertTrue("Number of images" in str(context.exception)) + + # Test with empty images list + with self.assertRaises(ValueError) as context: + processor(text=text, images=[]) + self.assertTrue("Number of images" in str(context.exception)) + + # Test with batch and empty images lists + with self.assertRaises(ValueError) as context: + processor(text=texts, images=[[], []]) + self.assertTrue("Number of images" in str(context.exception)) \ No newline at end of file From b83ffc0fe37fade253f2459aedaec4bfe3b42166 Mon Sep 17 00:00:00 2001 From: Miquel Farre Date: Fri, 20 Dec 2024 19:21:55 +0000 Subject: [PATCH 6/8] fix --- .../idefics3/test_processor_idefics3.py | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/tests/models/idefics3/test_processor_idefics3.py b/tests/models/idefics3/test_processor_idefics3.py index d315ce612fda8f..b22cd642b21b6c 100644 --- a/tests/models/idefics3/test_processor_idefics3.py +++ b/tests/models/idefics3/test_processor_idefics3.py @@ -511,28 +511,28 @@ def test_unstructured_kwargs(self): def test_text_only_inference(self): """Test that the processor works correctly with text-only input.""" processor = self.get_processor() - + text = "This is a simple text without images." inputs = processor(text=text) - + tokenized_sentence = processor.tokenizer(text, add_special_tokens=False) expected_input_ids = [[self.bos_token_id] + tokenized_sentence["input_ids"]] - + self.assertEqual(inputs["input_ids"], expected_input_ids) self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])]) self.assertTrue("pixel_values" not in inputs) self.assertTrue("pixel_attention_mask" not in inputs) - + # Test batch of texts without image tokens texts = ["First text.", "Second piece of text."] batch_inputs = processor(text=texts, padding=True) - + tokenized_1 = processor.tokenizer(texts[0], add_special_tokens=False) tokenized_2 = processor.tokenizer(texts[1], add_special_tokens=False) - + expected_1 = [self.bos_token_id] + tokenized_1["input_ids"] expected_2 = [self.bos_token_id] + tokenized_2["input_ids"] - + # Pad the shorter sequence pad_len = len(expected_2) - len(expected_1) if pad_len > 0: @@ -552,13 +552,13 @@ def test_text_only_inference(self): def test_missing_images_error(self): """Test that appropriate error is raised when images are referenced but not provided.""" processor = self.get_processor() - + # Test single text with image token but no image text = "Let me show you this image: What do you think?" with self.assertRaises(ValueError) as context: processor(text=text) - self.assertTrue("Number of images" in str(context.exception)) - + self.assertTrue("tokens in the text but no images were passed" in str(context.exception)) + # Test batch with image tokens but no images texts = [ "First text with token.", @@ -566,14 +566,14 @@ def test_missing_images_error(self): ] with self.assertRaises(ValueError) as context: processor(text=texts) - self.assertTrue("Number of images" in str(context.exception)) - + self.assertTrue("tokens in the text but no images were passed" in str(context.exception)) + # Test with empty images list with self.assertRaises(ValueError) as context: processor(text=text, images=[]) - self.assertTrue("Number of images" in str(context.exception)) - + self.assertTrue("tokens in the text but no images were passed" in str(context.exception)) + # Test with batch and empty images lists with self.assertRaises(ValueError) as context: processor(text=texts, images=[[], []]) - self.assertTrue("Number of images" in str(context.exception)) \ No newline at end of file + self.assertTrue("tokens in the text but no images were passed" in str(context.exception)) \ No newline at end of file From a82bc253acfa25c5d0d0686f0f3cfc4b4b7d759f Mon Sep 17 00:00:00 2001 From: Miquel Farre Date: Fri, 20 Dec 2024 19:24:27 +0000 Subject: [PATCH 7/8] fix --- tests/models/idefics3/test_processor_idefics3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/idefics3/test_processor_idefics3.py b/tests/models/idefics3/test_processor_idefics3.py index b22cd642b21b6c..f993b534fc252e 100644 --- a/tests/models/idefics3/test_processor_idefics3.py +++ b/tests/models/idefics3/test_processor_idefics3.py @@ -576,4 +576,4 @@ def test_missing_images_error(self): # Test with batch and empty images lists with self.assertRaises(ValueError) as context: processor(text=texts, images=[[], []]) - self.assertTrue("tokens in the text but no images were passed" in str(context.exception)) \ No newline at end of file + self.assertTrue("tokens in the text but no images were passed" in str(context.exception)) From 2fd930a1fe9377c2a2200a46bca7bb86fc0af8b7 Mon Sep 17 00:00:00 2001 From: Miquel Farre Date: Fri, 20 Dec 2024 21:04:53 +0000 Subject: [PATCH 8/8] fix --- .../idefics3/test_processor_idefics3.py | 81 +++++++++---------- 1 file changed, 40 insertions(+), 41 deletions(-) diff --git a/tests/models/idefics3/test_processor_idefics3.py b/tests/models/idefics3/test_processor_idefics3.py index f993b534fc252e..36c5d294844939 100644 --- a/tests/models/idefics3/test_processor_idefics3.py +++ b/tests/models/idefics3/test_processor_idefics3.py @@ -509,43 +509,43 @@ def test_unstructured_kwargs(self): @require_torch @require_vision def test_text_only_inference(self): - """Test that the processor works correctly with text-only input.""" - processor = self.get_processor() - - text = "This is a simple text without images." - inputs = processor(text=text) - - tokenized_sentence = processor.tokenizer(text, add_special_tokens=False) - expected_input_ids = [[self.bos_token_id] + tokenized_sentence["input_ids"]] - - self.assertEqual(inputs["input_ids"], expected_input_ids) - self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])]) - self.assertTrue("pixel_values" not in inputs) - self.assertTrue("pixel_attention_mask" not in inputs) - - # Test batch of texts without image tokens - texts = ["First text.", "Second piece of text."] - batch_inputs = processor(text=texts, padding=True) - - tokenized_1 = processor.tokenizer(texts[0], add_special_tokens=False) - tokenized_2 = processor.tokenizer(texts[1], add_special_tokens=False) - - expected_1 = [self.bos_token_id] + tokenized_1["input_ids"] - expected_2 = [self.bos_token_id] + tokenized_2["input_ids"] - - # Pad the shorter sequence - pad_len = len(expected_2) - len(expected_1) - if pad_len > 0: - padded_expected_1 = [self.padding_token_id] * pad_len + expected_1 - expected_attention_1 = [0] * pad_len + [1] * len(expected_1) - self.assertEqual(batch_inputs["input_ids"], [padded_expected_1, expected_2]) - self.assertEqual(batch_inputs["attention_mask"], [expected_attention_1, [1] * len(expected_2)]) - else: - pad_len = -pad_len - padded_expected_2 = [self.padding_token_id] * pad_len + expected_2 - expected_attention_2 = [0] * pad_len + [1] * len(expected_2) - self.assertEqual(batch_inputs["input_ids"], [expected_1, padded_expected_2]) - self.assertEqual(batch_inputs["attention_mask"], [[1] * len(expected_1), expected_attention_2]) + """Test that the processor works correctly with text-only input.""" + processor = self.get_processor() + + text = "This is a simple text without images." + inputs = processor(text=text) + + tokenized_sentence = processor.tokenizer(text, add_special_tokens=False) + expected_input_ids = [[self.bos_token_id] + tokenized_sentence["input_ids"]] + + self.assertEqual(inputs["input_ids"], expected_input_ids) + self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])]) + self.assertTrue("pixel_values" not in inputs) + self.assertTrue("pixel_attention_mask" not in inputs) + + # Test batch of texts without image tokens + texts = ["First text.", "Second piece of text."] + batch_inputs = processor(text=texts, padding=True) + + tokenized_1 = processor.tokenizer(texts[0], add_special_tokens=False) + tokenized_2 = processor.tokenizer(texts[1], add_special_tokens=False) + + expected_1 = [self.bos_token_id] + tokenized_1["input_ids"] + expected_2 = [self.bos_token_id] + tokenized_2["input_ids"] + + # Pad the shorter sequence + pad_len = len(expected_2) - len(expected_1) + if pad_len > 0: + padded_expected_1 = [self.padding_token_id] * pad_len + expected_1 + expected_attention_1 = [0] * pad_len + [1] * len(expected_1) + self.assertEqual(batch_inputs["input_ids"], [padded_expected_1, expected_2]) + self.assertEqual(batch_inputs["attention_mask"], [expected_attention_1, [1] * len(expected_2)]) + else: + pad_len = -pad_len + padded_expected_2 = [self.padding_token_id] * pad_len + expected_2 + expected_attention_2 = [0] * pad_len + [1] * len(expected_2) + self.assertEqual(batch_inputs["input_ids"], [expected_1, padded_expected_2]) + self.assertEqual(batch_inputs["attention_mask"], [[1] * len(expected_1), expected_attention_2]) @require_torch @require_vision @@ -568,12 +568,11 @@ def test_missing_images_error(self): processor(text=texts) self.assertTrue("tokens in the text but no images were passed" in str(context.exception)) - # Test with empty images list + # Test with None as Images with self.assertRaises(ValueError) as context: - processor(text=text, images=[]) + processor(text=text, images=None) self.assertTrue("tokens in the text but no images were passed" in str(context.exception)) - # Test with batch and empty images lists with self.assertRaises(ValueError) as context: - processor(text=texts, images=[[], []]) + processor(text=texts, images=None) self.assertTrue("tokens in the text but no images were passed" in str(context.exception))