
Commit dd2a2a2
Merge branch 'main' into rt-detr-onnx
echarlaix committed Dec 19, 2024
2 parents f25f227 + 0c42291 commit dd2a2a2
Showing 10 changed files with 305 additions and 6 deletions.
11 changes: 11 additions & 0 deletions docs/source/exporters/onnx/overview.mdx
@@ -39,6 +39,7 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra
- Decision Transformer
- Deit
- Detr
- DINOv2
- DistilBert
- Donut-Swin
- Electra
@@ -53,6 +54,7 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra
- GPT-NeoX
- OPT
- GroupVit
- Hiera
- Hubert
- IBert
- LayoutLM
@@ -64,7 +66,9 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra
- M2-M100
- Marian
- MarkupLM
- MaskFormer
- MBart
- MGP-STR
- Mistral
- MobileBert
- MobileVit
@@ -74,13 +78,16 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra
- MT5
- Musicgen (text-conditional only)
- Nystromformer
- OLMo
- OLMo2
- OWL-ViT
- Pegasus
- Perceiver
- Phi
- Phi3
- Pix2Struct
- PoolFormer
- PVT
- Qwen2(Qwen1.5)
- RegNet
- RemBERT
@@ -92,17 +99,21 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra
- SEW
- SEW-D
- Speech2Text
- SigLIP
- SpeechT5
- Splinter
- SqueezeBert
- Swin
- SwinV2
- T5
- Table Transformer
- TROCR
- UniSpeech
- UniSpeech SAT
- Vision Encoder Decoder
- Vit
- VitMAE
- VitMSN
- Wav2Vec2
- Wav2Vec2 Conformer
- WavLM
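
The newly listed architectures are exported through the usual programmatic entry point. A minimal sketch, assuming a stock DINOv2 checkpoint (the model id, output directory, and task choice are illustrative):

```python
# Hedged sketch: export one of the newly supported architectures to ONNX.
# "facebook/dinov2-base" and the output path are illustrative choices.
from optimum.exporters.onnx import main_export

main_export(
    "facebook/dinov2-base",     # any DINOv2 checkpoint should work here
    output="dinov2_onnx",       # directory that will receive model.onnx
    task="feature-extraction",  # DINOv2 is exported as a feature extractor
)
```

The equivalent `optimum-cli export onnx` command exercises the same code path.
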
143 changes: 143 additions & 0 deletions optimum/exporters/onnx/model_configs.py
@@ -82,6 +82,7 @@
from .model_patcher import (
CLIPModelPatcher,
FalconModelPatcher,
MgpstrModelPatcher,
MistralModelPatcher,
MusicgenModelPatcher,
SAMModelPatcher,
@@ -324,6 +325,15 @@ class LlamaOnnxConfig(TextDecoderWithPositionIdsOnnxConfig):
NORMALIZED_CONFIG_CLASS = NormalizedTextConfig


class OlmoOnnxConfig(LlamaOnnxConfig):
ATOL_FOR_VALIDATION = 1e-4
MIN_TRANSFORMERS_VERSION = version.parse("4.40.0")


class Olmo2OnnxConfig(OlmoOnnxConfig):
MIN_TRANSFORMERS_VERSION = version.parse("4.47.0")


class Qwen2OnnxConfig(LlamaOnnxConfig):
MIN_TRANSFORMERS_VERSION = version.parse("4.37.0")

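The `MIN_TRANSFORMERS_VERSION` attributes above gate each export on the transformers release that added the architecture (4.40.0 for OLMo, 4.47.0 for OLMo2). A caller-side sketch of the same guard, shown only to illustrate what the attribute encodes:

```python
# Hedged sketch: the exporter is expected to enforce an equivalent check
# itself; this only illustrates what MIN_TRANSFORMERS_VERSION guards against.
import transformers
from packaging import version

if version.parse(transformers.__version__) < version.parse("4.40.0"):
    raise RuntimeError("ONNX export of OLMo requires transformers >= 4.40.0")
```
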
@@ -837,6 +847,65 @@ class ConvNextV2OnnxConfig(ViTOnnxConfig):
DEFAULT_ONNX_OPSET = 11


class HieraOnnxConfig(ViTOnnxConfig):
DEFAULT_ONNX_OPSET = 11


class PvtOnnxConfig(ViTOnnxConfig):
DEFAULT_ONNX_OPSET = 11


class VitMAEOnnxConfig(ViTOnnxConfig):
# torch.onnx.errors.UnsupportedOperatorError: Exporting the operator 'aten::scaled_dot_product_attention' to ONNX opset version 11 is not supported.
# Support for this operator was added in version 14, try exporting with this version.
DEFAULT_ONNX_OPSET = 14


class VitMSNOnnxConfig(ViTOnnxConfig):
# torch.onnx.errors.UnsupportedOperatorError: Exporting the operator 'aten::scaled_dot_product_attention' to ONNX opset version 11 is not supported.
# Support for this operator was added in version 14, try exporting with this version.
DEFAULT_ONNX_OPSET = 14


class Dinov2DummyInputGenerator(DummyVisionInputGenerator):
def __init__(
self,
task: str,
normalized_config: NormalizedVisionConfig,
batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"],
width: int = DEFAULT_DUMMY_SHAPES["width"],
height: int = DEFAULT_DUMMY_SHAPES["height"],
**kwargs,
):
super().__init__(
task=task,
normalized_config=normalized_config,
batch_size=batch_size,
num_channels=num_channels,
width=width,
height=height,
**kwargs,
)

from transformers.onnx.utils import get_preprocessor

preprocessor = get_preprocessor(normalized_config._name_or_path)
if preprocessor is not None and hasattr(preprocessor, "crop_size"):
self.height = preprocessor.crop_size.get("height", self.height)
self.width = preprocessor.crop_size.get("width", self.width)

def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
input_ = super().generate(
input_name=input_name, framework=framework, int_dtype=int_dtype, float_dtype=float_dtype
)
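# pass-through for now: __init__ already adjusted self.height / self.width from
# the preprocessor's crop_size, so the parent generate() produces correctly sized dummies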
return input_


class Dinov2OnnxConfig(ViTOnnxConfig):
DUMMY_INPUT_GENERATOR_CLASSES = (Dinov2DummyInputGenerator,)


class MobileViTOnnxConfig(ViTOnnxConfig):
ATOL_FOR_VALIDATION = 1e-4
DEFAULT_ONNX_OPSET = 11
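
The `Dinov2DummyInputGenerator` above exists because DINOv2 image processors can center-crop to a `crop_size` that differs from the default dummy height and width, and tracing with mismatched spatial dimensions could bake wrong shapes into the graph. A quick way to see the value being picked up (checkpoint id is illustrative):

```python
# Hedged sketch: inspect the crop_size the dummy input generator would read.
from transformers import AutoImageProcessor

processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
if hasattr(processor, "crop_size"):
    print(processor.crop_size)  # e.g. {"height": 224, "width": 224}
```
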
@@ -878,6 +947,10 @@ class SwinOnnxConfig(ViTOnnxConfig):
DEFAULT_ONNX_OPSET = 11


class SwinV2OnnxConfig(SwinOnnxConfig):
pass


class Swin2srOnnxConfig(SwinOnnxConfig):
pass

@@ -913,6 +986,28 @@ class MobileNetV2OnnxConfig(MobileNetV1OnnxConfig):
pass


class MaskFormerOnnxConfig(ViTOnnxConfig):
# torch.onnx.errors.UnsupportedOperatorError: Exporting the operator 'aten::einsum' to ONNX opset version 11 is not supported.
# Support for this operator was added in version 12, try exporting with this version.
DEFAULT_ONNX_OPSET = 12

@property
def outputs(self) -> Dict[str, Dict[int, str]]:
if self.task == "image-segmentation":
return {
"class_queries_logits": {0: "batch_size", 1: "num_queries"},
"masks_queries_logits": {0: "batch_size", 1: "num_queries", 2: "height", 3: "width"},
}
else:
return super().outputs

@property
def torch_to_onnx_output_map(self) -> Dict[str, str]:
return {
"transformer_decoder_last_hidden_state": "last_hidden_state",
}


class DonutSwinOnnxConfig(ViTOnnxConfig):
DEFAULT_ONNX_OPSET = 11

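For the `image-segmentation` task, `MaskFormerOnnxConfig` above exposes the two query-level outputs instead of the default hidden state. A hedged onnxruntime sketch of consuming them (file path and input resolution are illustrative):

```python
# Hedged sketch: run an exported MaskFormer model and fetch the two
# segmentation outputs declared by MaskFormerOnnxConfig.outputs.
import numpy as np
import onnxruntime as ort

session = ort.InferenceSession("maskformer_onnx/model.onnx")
pixel_values = np.random.rand(1, 3, 384, 384).astype(np.float32)

class_logits, mask_logits = session.run(
    ["class_queries_logits", "masks_queries_logits"],
    {"pixel_values": pixel_values},
)
print(class_logits.shape)  # (batch_size, num_queries, num_labels + 1)
print(mask_logits.shape)   # (batch_size, num_queries, height, width)
```
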
@@ -933,6 +1028,21 @@ def torch_to_onnx_input_map(self) -> Dict[str, str]:
return {"x": "pixel_values"}


class MgpstrOnnxConfig(ViTOnnxConfig):
@property
def outputs(self) -> Dict[str, Dict[int, str]]:
return {
"char_logits": {0: "batch_size"},
"bpe_logits": {0: "batch_size"},
"wp_logits": {0: "batch_size"},
}

def patch_model_for_export(
self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
) -> "ModelPatcher":
return MgpstrModelPatcher(self, model, model_kwargs=model_kwargs)


class SentenceTransformersTransformerOnnxConfig(TextEncoderOnnxConfig):
NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
DEFAULT_ONNX_OPSET = 14 # Some bottleneck transformers models require a specific ONNX opset to be successfully exported. We put a rather high opset here for the export to work for all architectures.
@@ -1090,6 +1200,39 @@ def patch_model_for_export(
return CLIPModelPatcher(self, model, model_kwargs=model_kwargs)


class SiglipNormalizedConfig(CLIPNormalizedConfig):
pass


class SiglipOnnxConfig(CLIPOnnxConfig):
NORMALIZED_CONFIG_CLASS = SiglipNormalizedConfig
# torch.onnx.errors.UnsupportedOperatorError: Exporting the operator 'aten::scaled_dot_product_attention' to ONNX opset version 13 is not supported.
# Support for this operator was added in version 14, try exporting with this version.
DEFAULT_ONNX_OPSET = 14

@property
def inputs(self) -> Dict[str, Dict[int, str]]:
return {
"input_ids": {0: "text_batch_size", 1: "sequence_length"},
"pixel_values": {0: "image_batch_size", 1: "num_channels", 2: "height", 3: "width"},
# NOTE: No attention_mask
}


class SiglipTextWithProjectionOnnxConfig(CLIPTextWithProjectionOnnxConfig):
pass


class SiglipTextOnnxConfig(CLIPTextOnnxConfig):
pass


class SiglipVisionModelOnnxConfig(CLIPVisionModelOnnxConfig):
# torch.onnx.errors.UnsupportedOperatorError: Exporting the operator 'aten::scaled_dot_product_attention' to ONNX opset version 11 is not supported.
# Support for this operator was added in version 14, try exporting with this version.
DEFAULT_ONNX_OPSET = 14


class UNetOnnxConfig(VisionOnnxConfig):
ATOL_FOR_VALIDATION = 1e-4
# The ONNX export of a CLIPText architecture, another Stable Diffusion component, needs the Trilu
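
Note the deliberately absent `attention_mask` in `SiglipOnnxConfig.inputs` above: SigLIP was trained on fixed-length, padded text without masking, so the exported graph takes `input_ids` and `pixel_values` only. Text therefore needs `max_length` padding at inference time (checkpoint id is illustrative):

```python
# Hedged sketch: SigLIP text must be padded to a fixed length because the
# ONNX graph has no attention_mask input.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google/siglip-base-patch16-224")
inputs = tokenizer(["a photo of a cat"], padding="max_length", return_tensors="np")
print(inputs["input_ids"].shape)  # (1, fixed sequence_length)
```
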
26 changes: 26 additions & 0 deletions optimum/exporters/onnx/model_patcher.py
@@ -509,6 +509,32 @@ def patched_forward(*args, **kwargs):
self.patched_forward = patched_forward


class MgpstrModelPatcher(ModelPatcher):
def __init__(
self,
config: "OnnxConfig",
model: Union["PreTrainedModel", "TFPreTrainedModel"],
model_kwargs: Optional[Dict[str, Any]] = None,
):
super().__init__(config, model, model_kwargs)

@functools.wraps(self.orig_forward)
def patched_forward(*args, **kwargs):
signature = inspect.signature(self.orig_forward)
args, kwargs = override_arguments(args, kwargs, signature, model_kwargs=self.model_kwargs)

# logits is a tuple, so we unpack it and return them as separate outputs
char_logits, bpe_logits, wp_logits = self.orig_forward(*args, **kwargs).logits

return {
"char_logits": char_logits,
"bpe_logits": bpe_logits,
"wp_logits": wp_logits,
}

self.patched_forward = patched_forward


class SAMModelPatcher(ModelPatcher):
def __init__(
self,
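
With the patcher above, the exported MGP-STR graph exposes the three logits tensors as named outputs, so they can be fetched directly at runtime. A hedged onnxruntime sketch (file path and the 32x128 input resolution are illustrative, matching the base MGP-STR checkpoints):

```python
# Hedged sketch: fetch the three named outputs produced by MgpstrModelPatcher.
import numpy as np
import onnxruntime as ort

session = ort.InferenceSession("mgpstr_onnx/model.onnx")
pixel_values = np.random.rand(1, 3, 32, 128).astype(np.float32)

char_logits, bpe_logits, wp_logits = session.run(
    ["char_logits", "bpe_logits", "wp_logits"],
    {"pixel_values": pixel_values},
)
print(char_logits.shape, bpe_logits.shape, wp_logits.shape)
```
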
(Diff content for the remaining 7 changed files was not loaded.)
