From 62174bd0127a3df60de0fcd073e5d6248247c8ad Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Mon, 4 Dec 2023 21:45:07 +0000 Subject: [PATCH 1/4] Add support for siglip models --- docs/source/exporters/onnx/overview.mdx | 1 + optimum/exporters/onnx/model_configs.py | 19 +++++++++++++++++++ optimum/exporters/tasks.py | 13 +++++++++++++ optimum/utils/normalized_config.py | 1 + tests/exporters/exporters_utils.py | 1 + transformers | 1 + 6 files changed, 36 insertions(+) create mode 160000 transformers diff --git a/docs/source/exporters/onnx/overview.mdx b/docs/source/exporters/onnx/overview.mdx index 0ea17b6afec..2d84e376c67 100644 --- a/docs/source/exporters/onnx/overview.mdx +++ b/docs/source/exporters/onnx/overview.mdx @@ -83,6 +83,7 @@ Supported architectures: - SEW - SEW-D - Speech2Text +- SigLIP - SpeechT5 - Splinter - SqueezeBert diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index f4d50ad58d4..d7c019f9d39 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -806,6 +806,7 @@ class CLIPOnnxConfig(TextAndVisionOnnxConfig): @property def inputs(self) -> Dict[str, Dict[int, str]]: + print('1 get inputs') return { "input_ids": {0: "text_batch_size", 1: "sequence_length"}, "pixel_values": {0: "image_batch_size", 1: "num_channels", 2: "height", 3: "width"}, @@ -836,8 +837,10 @@ class CLIPTextWithProjectionOnnxConfig(TextEncoderOnnxConfig): @property def inputs(self) -> Dict[str, Dict[int, str]]: + print('2 get inputs') return { "input_ids": {0: "batch_size", 1: "sequence_length"}, + "attention_mask": {0: "text_batch_size", 1: "sequence_length"}, } @property @@ -876,6 +879,22 @@ def generate_dummy_inputs(self, framework: str = "pt", **kwargs): return dummy_inputs +class SiglipNormalizedConfig(CLIPNormalizedConfig): + pass + + +class SiglipOnnxConfig(CLIPOnnxConfig): + pass + + +class SiglipTextWithProjectionOnnxConfig(CLIPTextWithProjectionOnnxConfig): + pass + + +class SiglipTextOnnxConfig(CLIPTextOnnxConfig): + pass + + class UNetOnnxConfig(VisionOnnxConfig): ATOL_FOR_VALIDATION = 1e-3 # The ONNX export of a CLIPText architecture, an other Stable Diffusion component, needs the Trilu diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index 7545c72d6c6..e7ef41d99b6 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -864,6 +864,19 @@ class TasksManager: "audio-classification", onnx="SEWDOnnxConfig", ), + "siglip": supported_tasks_mapping( + "feature-extraction", + "zero-shot-image-classification", + onnx="SiglipOnnxConfig", + ), + "siglip-text-model": supported_tasks_mapping( + "feature-extraction", + onnx="SiglipTextOnnxConfig", + ), + "siglip-text-with-projection": supported_tasks_mapping( + "feature-extraction", + onnx="SiglipTextWithProjectionOnnxConfig", + ), "speech-to-text": supported_tasks_mapping( "feature-extraction", "feature-extraction-with-past", diff --git a/optimum/utils/normalized_config.py b/optimum/utils/normalized_config.py index 7a0af9a1a48..e98586e5bc9 100644 --- a/optimum/utils/normalized_config.py +++ b/optimum/utils/normalized_config.py @@ -201,6 +201,7 @@ class NormalizedConfigManager: 'perceiver', 'roformer', 'segformer', + 'siglip', 'squeezebert', """ diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index 6e43b65e34f..abd570a22a9 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -129,6 +129,7 @@ "roformer": 
"hf-internal-testing/tiny-random-RoFormerModel", # "sam": "fxmarty/sam-vit-tiny-random", # TODO: re-enable once PyTorch 2.1 is released, see https://github.com/huggingface/optimum/pull/1301 "segformer": "hf-internal-testing/tiny-random-SegformerModel", + "siglip": "HuggingFaceM4/tiny-random-siglip", "splinter": "hf-internal-testing/tiny-random-SplinterModel", "squeezebert": "hf-internal-testing/tiny-random-SqueezeBertModel", "swin": "hf-internal-testing/tiny-random-SwinModel", diff --git a/transformers b/transformers new file mode 160000 index 00000000000..e2e6dc9a6df --- /dev/null +++ b/transformers @@ -0,0 +1 @@ +Subproject commit e2e6dc9a6dfb4665e41a084c45f4e5a34ea32a14 From 3860ac39ea7468cb97be2e76d46a0ad4ab53a3ec Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Mon, 4 Dec 2023 21:48:51 +0000 Subject: [PATCH 2/4] cleanup --- optimum/exporters/onnx/model_configs.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index d7c019f9d39..66596a1bf70 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -806,7 +806,6 @@ class CLIPOnnxConfig(TextAndVisionOnnxConfig): @property def inputs(self) -> Dict[str, Dict[int, str]]: - print('1 get inputs') return { "input_ids": {0: "text_batch_size", 1: "sequence_length"}, "pixel_values": {0: "image_batch_size", 1: "num_channels", 2: "height", 3: "width"}, @@ -837,10 +836,8 @@ class CLIPTextWithProjectionOnnxConfig(TextEncoderOnnxConfig): @property def inputs(self) -> Dict[str, Dict[int, str]]: - print('2 get inputs') return { "input_ids": {0: "batch_size", 1: "sequence_length"}, - "attention_mask": {0: "text_batch_size", 1: "sequence_length"}, } @property From 3e235380803ccc223e688b4e0ef7a1685b46554f Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Mon, 4 Dec 2023 21:50:19 +0000 Subject: [PATCH 3/4] remove submodule --- transformers | 1 - 1 file changed, 1 deletion(-) delete mode 160000 transformers diff --git a/transformers b/transformers deleted file mode 160000 index e2e6dc9a6df..00000000000 --- a/transformers +++ /dev/null @@ -1 +0,0 @@ -Subproject commit e2e6dc9a6dfb4665e41a084c45f4e5a34ea32a14 From 94c332905cf9f0c3318cddce8e0ea1d3919be89e Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Sat, 23 Dec 2023 23:12:18 +0000 Subject: [PATCH 4/4] Remove attention mask from model input --- optimum/exporters/onnx/model_configs.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 66596a1bf70..761fceb6f25 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -881,7 +881,16 @@ class SiglipNormalizedConfig(CLIPNormalizedConfig): class SiglipOnnxConfig(CLIPOnnxConfig): - pass + NORMALIZED_CONFIG_CLASS = SiglipNormalizedConfig + DEFAULT_ONNX_OPSET = 13 + + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + return { + "input_ids": {0: "text_batch_size", 1: "sequence_length"}, + "pixel_values": {0: "image_batch_size", 1: "num_channels", 2: "height", 3: "width"}, + # NOTE: No attention_mask + } class SiglipTextWithProjectionOnnxConfig(CLIPTextWithProjectionOnnxConfig):