From 7a7cb6160d30e55d601dd3a04c827ce56605a66b Mon Sep 17 00:00:00 2001 From: Davy Chen Date: Thu, 18 Jul 2024 15:03:52 +0800 Subject: [PATCH 1/3] # Pass image color channel information to Transformers Background: In Hugging Face Transformers' image processors, e.g. CLIPImageProcessor, the preprocessing call accepts an input_data_format argument that states whether an image's color channels sit in the first or the last position of its shape. For example, if an image's shape is (512, 512, 3), its resolution is 512*512 pixels and it has 3 color channels (RGB). In this case, input_data_format is ImageChannelDimension.LAST (ChannelDimension.LAST in Transformers). Sometimes people use a customized image format with a shape of (3, 512, 512) for performance reasons. Transformers expects users to state this explicitly, otherwise it tries to infer it from the shape. Generally, an image has 1 or 3 color channels, representing grayscale or RGB, so the inference algorithm in Transformers looks for the values 1 or 3 in the image's shape. If your input images have a shape of (3, xxx, 1) or (1, xxx, 3), the inference gets confused and raises the following warning and exception: 'The channel dimension is ambiguous. Got image shape (1, xxx, 3). Assuming channels are the first dimension.' 'ValueError: mean must have 1 elements if it is an iterable, got 3' Fix: 1. Add a class ImageChannelDimension that defines the 2 possible positions of the color channels in an image's shape. 2. Accept this information in the model.encode method and pass it on to the tokenizer and the Transformers image processor. --- sentence_transformers/SentenceTransformer.py | 11 ++++++++++- sentence_transformers/models/CLIPModel.py | 5 +++-- sentence_transformers/util.py | 6 ++++++ 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/sentence_transformers/SentenceTransformer.py b/sentence_transformers/SentenceTransformer.py index c2d476782..25cd094d2 100644 --- a/sentence_transformers/SentenceTransformer.py +++ b/sentence_transformers/SentenceTransformer.py @@ -35,6 +35,7 @@ from .models import Normalize, Pooling, Transformer from .quantization import quantize_embeddings from .util import ( + ImageChannelDimension, batch_to_device, get_device_name, import_from_string, @@ -367,6 +368,7 @@ def encode( convert_to_tensor: Literal[False] = ..., device: str = ..., normalize_embeddings: bool = ..., + input_data_format: str = ImageChannelDimension.LAST, ) -> Tensor: ... @overload @@ -383,6 +385,7 @@ def encode( convert_to_tensor: Literal[False] = ..., device: str = ..., normalize_embeddings: bool = ..., + input_data_format: str = ImageChannelDimension.LAST, ) -> np.ndarray: ... @overload @@ -399,6 +402,7 @@ def encode( convert_to_tensor: Literal[True] = ..., device: str = ..., normalize_embeddings: bool = ..., + input_data_format: str = ImageChannelDimension.LAST, ) -> Tensor: ... @overload @@ -415,6 +419,7 @@ def encode( convert_to_tensor: Literal[False] = ..., device: str = ..., normalize_embeddings: bool = ..., + input_data_format: str = ImageChannelDimension.LAST, ) -> list[Tensor]: ... def encode( @@ -430,6 +435,7 @@ def encode( convert_to_tensor: bool = False, device: str = None, normalize_embeddings: bool = False, + input_data_format: str = ImageChannelDimension.LAST, ) -> list[Tensor] | np.ndarray | Tensor: """ Computes sentence embeddings.
@@ -491,6 +497,9 @@ def encode( self.is_hpu_graph_enabled = True self.eval() + # Will be used in Image Tokenizer + self.input_data_format = input_data_format + if show_progress_bar is None: show_progress_bar = logger.getEffectiveLevel() in (logging.INFO, logging.DEBUG) @@ -991,7 +1000,7 @@ def tokenize(self, texts: list[str] | list[dict] | list[tuple[str, str]]) -> dic Dict[str, Tensor]: A dictionary of tensors with the tokenized texts. Common keys are "input_ids", "attention_mask", and "token_type_ids". """ - return self._first_module().tokenize(texts) + return self._first_module().tokenize(texts, input_data_format=self.input_data_format) def get_sentence_features(self, *features) -> dict[Literal["sentence_embedding"], torch.Tensor]: return self._first_module().get_sentence_features(*features) diff --git a/sentence_transformers/models/CLIPModel.py b/sentence_transformers/models/CLIPModel.py index 4eccb3c55..1dacf830a 100644 --- a/sentence_transformers/models/CLIPModel.py +++ b/sentence_transformers/models/CLIPModel.py @@ -4,6 +4,7 @@ import transformers from PIL import Image from torch import nn +from ..util import ImageChannelDimension class CLIPModel(nn.Module): @@ -51,7 +52,7 @@ def forward(self, features: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: return features - def tokenize(self, texts, padding: str | bool = True) -> dict[str, torch.Tensor]: + def tokenize(self, texts, padding: str | bool = True, input_data_format: str = ImageChannelDimension.LAST) -> dict[str, torch.Tensor]: images = [] texts_values = [] image_text_info = [] @@ -69,7 +70,7 @@ def tokenize(self, texts, padding: str | bool = True) -> dict[str, torch.Tensor] encoding = self.processor.tokenizer(texts_values, return_tensors="pt", padding=padding) if len(images): - image_features = self.processor.image_processor(images, return_tensors="pt") + image_features = self.processor.image_processor(images, return_tensors="pt", input_data_format=input_data_format) encoding["pixel_values"] = image_features.pixel_values encoding["image_text_info"] = image_text_info diff --git a/sentence_transformers/util.py b/sentence_transformers/util.py index 8c6defe9c..420f74f8e 100644 --- a/sentence_transformers/util.py +++ b/sentence_transformers/util.py @@ -28,6 +28,12 @@ from sentence_transformers.cross_encoder.CrossEncoder import CrossEncoder from sentence_transformers.SentenceTransformer import SentenceTransformer +class ImageChannelDimension(): + """ + Defines the color channels' position in an Image's shape + """ + FIRST = "channels_first" + LAST = "channels_last" def _convert_to_tensor(a: list | np.ndarray | Tensor) -> Tensor: """ From 33a4ebcf702a231f0b83e15cd6bc10d5b2c8a611 Mon Sep 17 00:00:00 2001 From: Davy Chen Date: Thu, 25 Jul 2024 20:10:38 +0800 Subject: [PATCH 2/3] # Made 2 modifications. 1. Add doc-string for newly added 'image_channel_dimension' parameter of 'encode' function. 2. Changed the parameter's name from 'input_data_format' to 'image_channel_dimension'. 
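Illustrative usage of the renamed parameter (a minimal sketch, not part of the patch; the checkpoint name and image file below are placeholders):

    from PIL import Image
    from sentence_transformers import SentenceTransformer
    from sentence_transformers.util import ImageChannelDimension

    model = SentenceTransformer("clip-ViT-B-32")  # any CLIP-based checkpoint
    image = Image.open("example.jpg")             # placeholder image file
    # Explicitly tell the underlying Transformers image processor where the color
    # channels sit instead of relying on shape-based inference.
    # For channels-first inputs, pass ImageChannelDimension.FIRST instead.
    embedding = model.encode(image, image_channel_dimension=ImageChannelDimension.LAST)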
--- sentence_transformers/SentenceTransformer.py | 16 +++++++++------- sentence_transformers/models/CLIPModel.py | 4 ++-- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/sentence_transformers/SentenceTransformer.py b/sentence_transformers/SentenceTransformer.py index 25cd094d2..e0017e664 100644 --- a/sentence_transformers/SentenceTransformer.py +++ b/sentence_transformers/SentenceTransformer.py @@ -368,7 +368,7 @@ def encode( convert_to_tensor: Literal[False] = ..., device: str = ..., normalize_embeddings: bool = ..., - input_data_format: str = ImageChannelDimension.LAST, + image_channel_dimension: str = ImageChannelDimension.LAST, ) -> Tensor: ... @overload @@ -385,7 +385,7 @@ def encode( convert_to_tensor: Literal[False] = ..., device: str = ..., normalize_embeddings: bool = ..., - input_data_format: str = ImageChannelDimension.LAST, + image_channel_dimension: str = ImageChannelDimension.LAST, ) -> np.ndarray: ... @overload @@ -402,7 +402,7 @@ def encode( convert_to_tensor: Literal[True] = ..., device: str = ..., normalize_embeddings: bool = ..., - input_data_format: str = ImageChannelDimension.LAST, + image_channel_dimension: str = ImageChannelDimension.LAST, ) -> Tensor: ... @overload @@ -419,7 +419,7 @@ def encode( convert_to_tensor: Literal[False] = ..., device: str = ..., normalize_embeddings: bool = ..., - input_data_format: str = ImageChannelDimension.LAST, + image_channel_dimension: str = ImageChannelDimension.LAST, ) -> list[Tensor]: ... def encode( @@ -435,7 +435,7 @@ def encode( convert_to_tensor: bool = False, device: str = None, normalize_embeddings: bool = False, - input_data_format: str = ImageChannelDimension.LAST, + image_channel_dimension: str = ImageChannelDimension.LAST, ) -> list[Tensor] | np.ndarray | Tensor: """ Computes sentence embeddings. @@ -466,6 +466,8 @@ def encode( device (str, optional): Which :class:`torch.device` to use for the computation. Defaults to None. normalize_embeddings (bool, optional): Whether to normalize returned vectors to have length 1. In that case, the faster dot-product (util.dot_score) instead of cosine similarity can be used. Defaults to False. + image_channel_dimension (str, optional): Whether the color channels are the first or the last element of an image's shape. Defaults to ImageChannelDimension.LAST. + Pass ImageChannelDimension.FIRST for channels-first images, since shape-based inference can be ambiguous for shapes like (1, xxx, 3) or (3, xxx, 1). Returns: Union[List[Tensor], ndarray, Tensor]: By default, a 2d numpy array with shape [num_inputs, output_dimension] is returned. @@ -498,7 +500,7 @@ def encode( self.eval() # Will be used in Image Tokenizer - self.input_data_format = input_data_format + self.image_channel_dimension = image_channel_dimension if show_progress_bar is None: show_progress_bar = logger.getEffectiveLevel() in (logging.INFO, logging.DEBUG) @@ -1000,7 +1002,7 @@ def tokenize(self, texts: list[str] | list[dict] | list[tuple[str, str]]) -> dic Dict[str, Tensor]: A dictionary of tensors with the tokenized texts. Common keys are "input_ids", "attention_mask", and "token_type_ids".
""" - return self._first_module().tokenize(texts, input_data_format=self.input_data_format) + return self._first_module().tokenize(texts, image_channel_dimension=self.image_channel_dimension) def get_sentence_features(self, *features) -> dict[Literal["sentence_embedding"], torch.Tensor]: return self._first_module().get_sentence_features(*features) diff --git a/sentence_transformers/models/CLIPModel.py b/sentence_transformers/models/CLIPModel.py index 1dacf830a..d030016f7 100644 --- a/sentence_transformers/models/CLIPModel.py +++ b/sentence_transformers/models/CLIPModel.py @@ -52,7 +52,7 @@ def forward(self, features: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: return features - def tokenize(self, texts, padding: str | bool = True, input_data_format: str = ImageChannelDimension.LAST) -> dict[str, torch.Tensor]: + def tokenize(self, texts, padding: str | bool = True, image_channel_dimension: str = ImageChannelDimension.LAST) -> dict[str, torch.Tensor]: images = [] texts_values = [] image_text_info = [] @@ -70,7 +70,7 @@ def tokenize(self, texts, padding: str | bool = True, input_data_format: str = I encoding = self.processor.tokenizer(texts_values, return_tensors="pt", padding=padding) if len(images): - image_features = self.processor.image_processor(images, return_tensors="pt", input_data_format=input_data_format) + image_features = self.processor.image_processor(images, return_tensors="pt", input_data_format=image_channel_dimension) encoding["pixel_values"] = image_features.pixel_values encoding["image_text_info"] = image_text_info From e6830519601554a9eecf31bfd5668d7fceca7f29 Mon Sep 17 00:00:00 2001 From: Davy Chen Date: Sat, 27 Jul 2024 18:34:16 +0800 Subject: [PATCH 3/3] # Modified 2 files 1. To make the 'tokenize' interface compatible between Texts and Images. --- sentence_transformers/models/CLIPModel.py | 3 ++- sentence_transformers/models/Transformer.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/sentence_transformers/models/CLIPModel.py b/sentence_transformers/models/CLIPModel.py index d030016f7..ff3442ef2 100644 --- a/sentence_transformers/models/CLIPModel.py +++ b/sentence_transformers/models/CLIPModel.py @@ -52,10 +52,11 @@ def forward(self, features: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: return features - def tokenize(self, texts, padding: str | bool = True, image_channel_dimension: str = ImageChannelDimension.LAST) -> dict[str, torch.Tensor]: + def tokenize(self, texts, padding: str | bool = True, **kwargs) -> dict[str, torch.Tensor]: images = [] texts_values = [] image_text_info = [] + image_channel_dimension = kwargs.get("image_channel_dimension", ImageChannelDimension.LAST) for idx, data in enumerate(texts): if isinstance(data, Image.Image): # An Image diff --git a/sentence_transformers/models/Transformer.py b/sentence_transformers/models/Transformer.py index 923c0586e..0953a9c8c 100644 --- a/sentence_transformers/models/Transformer.py +++ b/sentence_transformers/models/Transformer.py @@ -134,7 +134,7 @@ def get_word_embedding_dimension(self) -> int: return self.auto_model.config.hidden_size def tokenize( - self, texts: list[str] | list[dict] | list[tuple[str, str]], padding: str | bool = True + self, texts: list[str] | list[dict] | list[tuple[str, str]], padding: str | bool = True, **kwargs ) -> dict[str, torch.Tensor]: """Tokenizes a text and maps tokens to token-ids""" output = {}