From 62174bd0127a3df60de0fcd073e5d6248247c8ad Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Mon, 4 Dec 2023 21:45:07 +0000 Subject: [PATCH 1/4] Add support for siglip models --- docs/source/exporters/onnx/overview.mdx | 1 + optimum/exporters/onnx/model_configs.py | 19 +++++++++++++++++++ optimum/exporters/tasks.py | 13 +++++++++++++ optimum/utils/normalized_config.py | 1 + tests/exporters/exporters_utils.py | 1 + transformers | 1 + 6 files changed, 36 insertions(+) create mode 160000 transformers diff --git a/docs/source/exporters/onnx/overview.mdx b/docs/source/exporters/onnx/overview.mdx index 0ea17b6afec..2d84e376c67 100644 --- a/docs/source/exporters/onnx/overview.mdx +++ b/docs/source/exporters/onnx/overview.mdx @@ -83,6 +83,7 @@ Supported architectures: - SEW - SEW-D - Speech2Text +- SigLIP - SpeechT5 - Splinter - SqueezeBert diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index f4d50ad58d4..d7c019f9d39 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -806,6 +806,7 @@ class CLIPOnnxConfig(TextAndVisionOnnxConfig): @property def inputs(self) -> Dict[str, Dict[int, str]]: + print('1 get inputs') return { "input_ids": {0: "text_batch_size", 1: "sequence_length"}, "pixel_values": {0: "image_batch_size", 1: "num_channels", 2: "height", 3: "width"}, @@ -836,8 +837,10 @@ class CLIPTextWithProjectionOnnxConfig(TextEncoderOnnxConfig): @property def inputs(self) -> Dict[str, Dict[int, str]]: + print('2 get inputs') return { "input_ids": {0: "batch_size", 1: "sequence_length"}, + "attention_mask": {0: "text_batch_size", 1: "sequence_length"}, } @property @@ -876,6 +879,22 @@ def generate_dummy_inputs(self, framework: str = "pt", **kwargs): return dummy_inputs +class SiglipNormalizedConfig(CLIPNormalizedConfig): + pass + + +class SiglipOnnxConfig(CLIPOnnxConfig): + pass + + +class SiglipTextWithProjectionOnnxConfig(CLIPTextWithProjectionOnnxConfig): + pass + + +class SiglipTextOnnxConfig(CLIPTextOnnxConfig): + pass + + class UNetOnnxConfig(VisionOnnxConfig): ATOL_FOR_VALIDATION = 1e-3 # The ONNX export of a CLIPText architecture, an other Stable Diffusion component, needs the Trilu diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index 7545c72d6c6..e7ef41d99b6 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -864,6 +864,19 @@ class TasksManager: "audio-classification", onnx="SEWDOnnxConfig", ), + "siglip": supported_tasks_mapping( + "feature-extraction", + "zero-shot-image-classification", + onnx="SiglipOnnxConfig", + ), + "siglip-text-model": supported_tasks_mapping( + "feature-extraction", + onnx="SiglipTextOnnxConfig", + ), + "siglip-text-with-projection": supported_tasks_mapping( + "feature-extraction", + onnx="SiglipTextWithProjectionOnnxConfig", + ), "speech-to-text": supported_tasks_mapping( "feature-extraction", "feature-extraction-with-past", diff --git a/optimum/utils/normalized_config.py b/optimum/utils/normalized_config.py index 7a0af9a1a48..e98586e5bc9 100644 --- a/optimum/utils/normalized_config.py +++ b/optimum/utils/normalized_config.py @@ -201,6 +201,7 @@ class NormalizedConfigManager: 'perceiver', 'roformer', 'segformer', + 'siglip', 'squeezebert', """ diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index 6e43b65e34f..abd570a22a9 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -129,6 +129,7 @@ "roformer": 
"hf-internal-testing/tiny-random-RoFormerModel", # "sam": "fxmarty/sam-vit-tiny-random", # TODO: re-enable once PyTorch 2.1 is released, see https://github.com/huggingface/optimum/pull/1301 "segformer": "hf-internal-testing/tiny-random-SegformerModel", + "siglip": "HuggingFaceM4/tiny-random-siglip", "splinter": "hf-internal-testing/tiny-random-SplinterModel", "squeezebert": "hf-internal-testing/tiny-random-SqueezeBertModel", "swin": "hf-internal-testing/tiny-random-SwinModel", diff --git a/transformers b/transformers new file mode 160000 index 00000000000..e2e6dc9a6df --- /dev/null +++ b/transformers @@ -0,0 +1 @@ +Subproject commit e2e6dc9a6dfb4665e41a084c45f4e5a34ea32a14 From 3860ac39ea7468cb97be2e76d46a0ad4ab53a3ec Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Mon, 4 Dec 2023 21:48:51 +0000 Subject: [PATCH 2/4] cleanup --- optimum/exporters/onnx/model_configs.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index d7c019f9d39..66596a1bf70 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -806,7 +806,6 @@ class CLIPOnnxConfig(TextAndVisionOnnxConfig): @property def inputs(self) -> Dict[str, Dict[int, str]]: - print('1 get inputs') return { "input_ids": {0: "text_batch_size", 1: "sequence_length"}, "pixel_values": {0: "image_batch_size", 1: "num_channels", 2: "height", 3: "width"}, @@ -837,10 +836,8 @@ class CLIPTextWithProjectionOnnxConfig(TextEncoderOnnxConfig): @property def inputs(self) -> Dict[str, Dict[int, str]]: - print('2 get inputs') return { "input_ids": {0: "batch_size", 1: "sequence_length"}, - "attention_mask": {0: "text_batch_size", 1: "sequence_length"}, } @property From 3e235380803ccc223e688b4e0ef7a1685b46554f Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Mon, 4 Dec 2023 21:50:19 +0000 Subject: [PATCH 3/4] remove submodule --- transformers | 1 - 1 file changed, 1 deletion(-) delete mode 160000 transformers diff --git a/transformers b/transformers deleted file mode 160000 index e2e6dc9a6df..00000000000 --- a/transformers +++ /dev/null @@ -1 +0,0 @@ -Subproject commit e2e6dc9a6dfb4665e41a084c45f4e5a34ea32a14 From 94c332905cf9f0c3318cddce8e0ea1d3919be89e Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Sat, 23 Dec 2023 23:12:18 +0000 Subject: [PATCH 4/4] Remove attention mask from model input --- optimum/exporters/onnx/model_configs.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 66596a1bf70..761fceb6f25 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -881,7 +881,16 @@ class SiglipNormalizedConfig(CLIPNormalizedConfig): class SiglipOnnxConfig(CLIPOnnxConfig): - pass + NORMALIZED_CONFIG_CLASS = SiglipNormalizedConfig + DEFAULT_ONNX_OPSET = 13 + + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + return { + "input_ids": {0: "text_batch_size", 1: "sequence_length"}, + "pixel_values": {0: "image_batch_size", 1: "num_channels", 2: "height", 3: "width"}, + # NOTE: No attention_mask + } class SiglipTextWithProjectionOnnxConfig(CLIPTextWithProjectionOnnxConfig):