diff --git a/docs/source/exporters/onnx/overview.mdx b/docs/source/exporters/onnx/overview.mdx index 57005b85678..b5129c23f21 100644 --- a/docs/source/exporters/onnx/overview.mdx +++ b/docs/source/exporters/onnx/overview.mdx @@ -39,6 +39,7 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra - Decision Transformer - Deit - Detr +- DINOv2 - DistilBert - Donut-Swin - Electra @@ -53,6 +54,7 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra - GPT-NeoX - OPT - GroupVit +- Hiera - Hubert - IBert - LayoutLM @@ -64,7 +66,9 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra - M2-M100 - Marian - MarkupLM +- MaskFormer - MBart +- MGP-STR - Mistral - MobileBert - MobileVit @@ -74,6 +78,8 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra - MT5 - Musicgen (text-conditional only) - Nystromformer +- OLMo +- OLMo2 - OWL-ViT - Pegasus - Perceiver @@ -81,6 +87,7 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra - Phi3 - Pix2Struct - PoolFormer +- PVT - Qwen2(Qwen1.5) - RegNet - RemBERT @@ -92,10 +99,12 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra - SEW - SEW-D - Speech2Text +- SigLIP - SpeechT5 - Splinter - SqueezeBert - Swin +- SwinV2 - T5 - Table Transformer - TROCR @@ -103,6 +112,8 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra - UniSpeech SAT - Vision Encoder Decoder - Vit +- VitMAE +- VitMSN - Wav2Vec2 - Wav2Vec2 Conformer - WavLM diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 63b98152dd8..050f0597aec 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -82,6 +82,7 @@ from .model_patcher import ( CLIPModelPatcher, FalconModelPatcher, + MgpstrModelPatcher, MistralModelPatcher, MusicgenModelPatcher, SAMModelPatcher, @@ -324,6 +325,15 @@ class LlamaOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedTextConfig +class OlmoOnnxConfig(LlamaOnnxConfig): + ATOL_FOR_VALIDATION = 1e-4 + MIN_TRANSFORMERS_VERSION = version.parse("4.40.0") + + +class Olmo2OnnxConfig(OlmoOnnxConfig): + MIN_TRANSFORMERS_VERSION = version.parse("4.47.0") + + class Qwen2OnnxConfig(LlamaOnnxConfig): MIN_TRANSFORMERS_VERSION = version.parse("4.37.0") @@ -837,6 +847,65 @@ class ConvNextV2OnnxConfig(ViTOnnxConfig): DEFAULT_ONNX_OPSET = 11 +class HieraOnnxConfig(ViTOnnxConfig): + DEFAULT_ONNX_OPSET = 11 + + +class PvtOnnxConfig(ViTOnnxConfig): + DEFAULT_ONNX_OPSET = 11 + + +class VitMAEOnnxConfig(ViTOnnxConfig): + # torch.onnx.errors.UnsupportedOperatorError: Exporting the operator 'aten::scaled_dot_product_attention' to ONNX opset version 11 is not supported. + # Support for this operator was added in version 14, try exporting with this version. + DEFAULT_ONNX_OPSET = 14 + + +class VitMSNOnnxConfig(ViTOnnxConfig): + # torch.onnx.errors.UnsupportedOperatorError: Exporting the operator 'aten::scaled_dot_product_attention' to ONNX opset version 11 is not supported. + # Support for this operator was added in version 14, try exporting with this version. 
+ DEFAULT_ONNX_OPSET = 14 + + +class Dinov2DummyInputGenerator(DummyVisionInputGenerator): + def __init__( + self, + task: str, + normalized_config: NormalizedVisionConfig, + batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], + num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"], + width: int = DEFAULT_DUMMY_SHAPES["width"], + height: int = DEFAULT_DUMMY_SHAPES["height"], + **kwargs, + ): + super().__init__( + task=task, + normalized_config=normalized_config, + batch_size=batch_size, + num_channels=num_channels, + width=width, + height=height, + **kwargs, + ) + + from transformers.onnx.utils import get_preprocessor + + preprocessor = get_preprocessor(normalized_config._name_or_path) + if preprocessor is not None and hasattr(preprocessor, "crop_size"): + self.height = preprocessor.crop_size.get("height", self.height) + self.width = preprocessor.crop_size.get("width", self.width) + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + input_ = super().generate( + input_name=input_name, framework=framework, int_dtype=int_dtype, float_dtype=float_dtype + ) + return input_ + + +class Dinov2OnnxConfig(ViTOnnxConfig): + DUMMY_INPUT_GENERATOR_CLASSES = (Dinov2DummyInputGenerator,) + + class MobileViTOnnxConfig(ViTOnnxConfig): ATOL_FOR_VALIDATION = 1e-4 DEFAULT_ONNX_OPSET = 11 @@ -878,6 +947,10 @@ class SwinOnnxConfig(ViTOnnxConfig): DEFAULT_ONNX_OPSET = 11 +class SwinV2OnnxConfig(SwinOnnxConfig): + pass + + class Swin2srOnnxConfig(SwinOnnxConfig): pass @@ -913,6 +986,28 @@ class MobileNetV2OnnxConfig(MobileNetV1OnnxConfig): pass +class MaskFormerOnnxConfig(ViTOnnxConfig): + # torch.onnx.errors.UnsupportedOperatorError: Exporting the operator 'aten::einsum' to ONNX opset version 11 is not supported. + # Support for this operator was added in version 12, try exporting with this version. + DEFAULT_ONNX_OPSET = 12 + + @property + def outputs(self) -> Dict[str, Dict[int, str]]: + if self.task == "image-segmentation": + return { + "class_queries_logits": {0: "batch_size", 1: "num_queries"}, + "masks_queries_logits": {0: "batch_size", 1: "num_queries", 2: "height", 3: "width"}, + } + else: + return super().outputs + + @property + def torch_to_onnx_output_map(self) -> Dict[str, str]: + return { + "transformer_decoder_last_hidden_state": "last_hidden_state", + } + + class DonutSwinOnnxConfig(ViTOnnxConfig): DEFAULT_ONNX_OPSET = 11 @@ -933,6 +1028,21 @@ def torch_to_onnx_input_map(self) -> Dict[str, str]: return {"x": "pixel_values"} +class MgpstrOnnxConfig(ViTOnnxConfig): + @property + def outputs(self) -> Dict[str, Dict[int, str]]: + return { + "char_logits": {0: "batch_size"}, + "bpe_logits": {0: "batch_size"}, + "wp_logits": {0: "batch_size"}, + } + + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return MgpstrModelPatcher(self, model, model_kwargs=model_kwargs) + + class SentenceTransformersTransformerOnnxConfig(TextEncoderOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedTextConfig DEFAULT_ONNX_OPSET = 14 # Some bottleneck transformers models require a specific ONNX opset to be successfully exported. We put a rather high opset here for the export to work for all architectures. 
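
For context on the configs introduced in this hunk, the sketch below is illustrative only and not part of the patch: it shows how the new classes are reached through the public `main_export` entry point once the corresponding task mappings in `optimum/exporters/tasks.py` are registered further down. The checkpoint ids are the tiny test models assumed later in this diff.

# Minimal sketch: the exporter resolves Dinov2OnnxConfig / MaskFormerOnnxConfig
# from the model type and the requested task, with no manual config wiring.
from optimum.exporters.onnx import main_export

main_export(
    "hf-internal-testing/tiny-random-Dinov2Model",
    output="dinov2_onnx",
    task="feature-extraction",
)

# For image-segmentation, MaskFormerOnnxConfig exposes class_queries_logits and
# masks_queries_logits as declared in its `outputs` property above.
main_export(
    "hf-internal-testing/tiny-random-MaskFormerForInstanceSegmentation",
    output="maskformer_onnx",
    task="image-segmentation",
)
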
@@ -1090,6 +1200,39 @@ def patch_model_for_export( return CLIPModelPatcher(self, model, model_kwargs=model_kwargs) +class SiglipNormalizedConfig(CLIPNormalizedConfig): + pass + + +class SiglipOnnxConfig(CLIPOnnxConfig): + NORMALIZED_CONFIG_CLASS = SiglipNormalizedConfig + # torch.onnx.errors.UnsupportedOperatorError: Exporting the operator 'aten::scaled_dot_product_attention' to ONNX opset version 13 is not supported. + # Support for this operator was added in version 14, try exporting with this version. + DEFAULT_ONNX_OPSET = 14 + + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + return { + "input_ids": {0: "text_batch_size", 1: "sequence_length"}, + "pixel_values": {0: "image_batch_size", 1: "num_channels", 2: "height", 3: "width"}, + # NOTE: No attention_mask + } + + +class SiglipTextWithProjectionOnnxConfig(CLIPTextWithProjectionOnnxConfig): + pass + + +class SiglipTextOnnxConfig(CLIPTextOnnxConfig): + pass + + +class SiglipVisionModelOnnxConfig(CLIPVisionModelOnnxConfig): + # torch.onnx.errors.UnsupportedOperatorError: Exporting the operator 'aten::scaled_dot_product_attention' to ONNX opset version 11 is not supported. + # Support for this operator was added in version 14, try exporting with this version. + DEFAULT_ONNX_OPSET = 14 + + class UNetOnnxConfig(VisionOnnxConfig): ATOL_FOR_VALIDATION = 1e-4 # The ONNX export of a CLIPText architecture, an other Stable Diffusion component, needs the Trilu diff --git a/optimum/exporters/onnx/model_patcher.py b/optimum/exporters/onnx/model_patcher.py index 2c0f9aeba67..083bc127999 100644 --- a/optimum/exporters/onnx/model_patcher.py +++ b/optimum/exporters/onnx/model_patcher.py @@ -509,6 +509,32 @@ def patched_forward(*args, **kwargs): self.patched_forward = patched_forward +class MgpstrModelPatcher(ModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: Union["PreTrainedModel", "TFPreTrainedModel"], + model_kwargs: Optional[Dict[str, Any]] = None, + ): + super().__init__(config, model, model_kwargs) + + @functools.wraps(self.orig_forward) + def patched_forward(*args, **kwargs): + signature = inspect.signature(self.orig_forward) + args, kwargs = override_arguments(args, kwargs, signature, model_kwargs=self.model_kwargs) + + # logits is a tuple, so we unpack it and return them as separate outputs + char_logits, bpe_logits, wp_logits = self.orig_forward(*args, **kwargs).logits + + return { + "char_logits": char_logits, + "bpe_logits": bpe_logits, + "wp_logits": wp_logits, + } + + self.patched_forward = patched_forward + + class SAMModelPatcher(ModelPatcher): def __init__( self, diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index 1394ad39ae5..4ffb63fd6aa 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -209,9 +209,14 @@ class TasksManager: "feature-extraction": "AutoModel", "fill-mask": "AutoModelForMaskedLM", "image-classification": "AutoModelForImageClassification", - "image-segmentation": ("AutoModelForImageSegmentation", "AutoModelForSemanticSegmentation"), + "image-segmentation": ( + "AutoModelForImageSegmentation", + "AutoModelForSemanticSegmentation", + "AutoModelForInstanceSegmentation", + "AutoModelForUniversalSegmentation", + ), "image-to-image": "AutoModelForImageToImage", - "image-to-text": "AutoModelForVision2Seq", + "image-to-text": ("AutoModelForVision2Seq", "AutoModel"), "mask-generation": "AutoModel", "masked-im": "AutoModelForMaskedImageModeling", "multiple-choice": "AutoModelForMultipleChoice", @@ -224,6 +229,7 @@ class TasksManager: 
"text2text-generation": "AutoModelForSeq2SeqLM", "text-classification": "AutoModelForSequenceClassification", "token-classification": "AutoModelForTokenClassification", + "visual-question-answering": "AutoModelForVisualQuestionAnswering", "zero-shot-image-classification": "AutoModelForZeroShotImageClassification", "zero-shot-object-detection": "AutoModelForZeroShotObjectDetection", } @@ -307,6 +313,7 @@ class TasksManager: "vision2seq-lm": "image-to-text", "zero-shot-classification": "text-classification", "image-feature-extraction": "feature-extraction", + "pretraining": "feature-extraction", # for backward compatibility and testing (where # model task and model type are still the same) "stable-diffusion": "text-to-image", @@ -601,6 +608,11 @@ class TasksManager: "image-segmentation", onnx="DetrOnnxConfig", ), + "dinov2": supported_tasks_mapping( + "feature-extraction", + "image-classification", + onnx="Dinov2OnnxConfig", + ), "distilbert": supported_tasks_mapping( "feature-extraction", "fill-mask", @@ -732,6 +744,11 @@ class TasksManager: "feature-extraction", onnx="GroupViTOnnxConfig", ), + "hiera": supported_tasks_mapping( + "feature-extraction", + "image-classification", + onnx="HieraOnnxConfig", + ), "hubert": supported_tasks_mapping( "feature-extraction", "automatic-speech-recognition", @@ -813,6 +830,11 @@ class TasksManager: "question-answering", onnx="MarkupLMOnnxConfig", ), + "maskformer": supported_tasks_mapping( + "feature-extraction", + "image-segmentation", + onnx="MaskFormerOnnxConfig", + ), "mbart": supported_tasks_mapping( "feature-extraction", "feature-extraction-with-past", @@ -824,6 +846,11 @@ class TasksManager: "question-answering", onnx="MBartOnnxConfig", ), + "mgp-str": supported_tasks_mapping( + "feature-extraction", + "image-to-text", + onnx="MgpstrOnnxConfig", + ), "mistral": supported_tasks_mapping( "feature-extraction", "feature-extraction-with-past", @@ -949,6 +976,20 @@ class TasksManager: "text-generation-with-past", onnx="GraniteOnnxConfig", ), + "olmo": supported_tasks_mapping( + "feature-extraction", + "feature-extraction-with-past", + "text-generation", + "text-generation-with-past", + onnx="OlmoOnnxConfig", + ), + "olmo2": supported_tasks_mapping( + "feature-extraction", + "feature-extraction-with-past", + "text-generation", + "text-generation-with-past", + onnx="Olmo2OnnxConfig", + ), "pegasus": supported_tasks_mapping( "feature-extraction", "feature-extraction-with-past", @@ -992,6 +1033,11 @@ class TasksManager: "image-classification", onnx="PoolFormerOnnxConfig", ), + "pvt": supported_tasks_mapping( + "feature-extraction", + "image-classification", + onnx="PvtOnnxConfig", + ), "regnet": supported_tasks_mapping( "feature-extraction", "image-classification", @@ -1055,6 +1101,23 @@ class TasksManager: "audio-classification", onnx="SEWDOnnxConfig", ), + "siglip": supported_tasks_mapping( + "feature-extraction", + "zero-shot-image-classification", + onnx="SiglipOnnxConfig", + ), + "siglip-text-model": supported_tasks_mapping( + "feature-extraction", + onnx="SiglipTextOnnxConfig", + ), + "siglip-text-with-projection": supported_tasks_mapping( + "feature-extraction", + onnx="SiglipTextWithProjectionOnnxConfig", + ), + "siglip-vision-model": supported_tasks_mapping( + "feature-extraction", + onnx="SiglipVisionModelOnnxConfig", + ), "speech-to-text": supported_tasks_mapping( "feature-extraction", "feature-extraction-with-past", @@ -1087,6 +1150,12 @@ class TasksManager: "masked-im", onnx="SwinOnnxConfig", ), + "swinv2": supported_tasks_mapping( + 
"feature-extraction", + "image-classification", + "masked-im", + onnx="SwinV2OnnxConfig", + ), "swin2sr": supported_tasks_mapping( "feature-extraction", "image-to-image", @@ -1133,7 +1202,19 @@ class TasksManager: onnx="VisionEncoderDecoderOnnxConfig", ), "vit": supported_tasks_mapping( - "feature-extraction", "image-classification", "masked-im", onnx="ViTOnnxConfig" + "feature-extraction", + "image-classification", + "masked-im", + onnx="ViTOnnxConfig", + ), + "vit-mae": supported_tasks_mapping( + "feature-extraction", + onnx="VitMAEOnnxConfig", + ), + "vit-msn": supported_tasks_mapping( + "feature-extraction", + "image-classification", + onnx="VitMSNOnnxConfig", ), "vits": supported_tasks_mapping( "text-to-audio", @@ -1217,6 +1298,10 @@ class TasksManager: "unet-2d-condition", "vae-encoder", "vae-decoder", + "clip-text-model", + "clip-text-with-projection", + "siglip-text-model", + "siglip-text-with-projection", # redundant model types "trocr", # same as vision-encoder-decoder } @@ -2126,6 +2211,7 @@ def get_model_from_task( use_auth_token = model_kwargs.pop("use_auth_token", None) token = model_kwargs.pop("token", None) trust_remote_code = model_kwargs.pop("trust_remote_code", False) + model_kwargs["torch_dtype"] = torch_dtype if use_auth_token is not None: warnings.warn( @@ -2143,6 +2229,7 @@ def get_model_from_task( token=token, revision=revision, trust_remote_code=trust_remote_code, + model_kwargs=model_kwargs, ) else: try: diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py index 8e5a814b689..a55eb064fa3 100644 --- a/optimum/onnxruntime/modeling_ort.py +++ b/optimum/onnxruntime/modeling_ort.py @@ -1696,7 +1696,7 @@ def forward( @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTModelForImageClassification(ORTModel): """ - ONNX Model for image-classification tasks. This class officially supports beit, convnext, convnextv2, data2vec_vision, deit, levit, mobilenet_v1, mobilenet_v2, mobilevit, poolformer, resnet, segformer, swin, vit. + ONNX Model for image-classification tasks. This class officially supports beit, convnext, convnextv2, data2vec_vision, deit, dinov2, levit, mobilenet_v1, mobilenet_v2, mobilevit, poolformer, resnet, segformer, swin, swinv2, vit. """ auto_model_class = AutoModelForImageClassification @@ -1784,7 +1784,7 @@ def forward( @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTModelForSemanticSegmentation(ORTModel): """ - ONNX Model for semantic-segmentation, with an all-MLP decode head on top e.g. for ADE20k, CityScapes. This class officially supports segformer. + ONNX Model for semantic-segmentation, with an all-MLP decode head on top e.g. for ADE20k, CityScapes. This class officially supports maskformer, segformer. 
""" auto_model_class = AutoModelForSemanticSegmentation diff --git a/optimum/onnxruntime/utils.py b/optimum/onnxruntime/utils.py index 9e92e0bd325..79375d958ff 100644 --- a/optimum/onnxruntime/utils.py +++ b/optimum/onnxruntime/utils.py @@ -178,6 +178,7 @@ def check_optimization_supported_model(cls, model_type: str, optimization_config "clip", "vit", "swin", + "swinv2", ] model_type = model_type.replace("_", "-") if (model_type not in cls._conf) or (cls._conf[model_type] not in supported_model_types_for_optimization): diff --git a/optimum/utils/normalized_config.py b/optimum/utils/normalized_config.py index 9ceed24c2dd..9fde2bd4696 100644 --- a/optimum/utils/normalized_config.py +++ b/optimum/utils/normalized_config.py @@ -204,8 +204,10 @@ class NormalizedConfigManager: 'data2vec-text', 'data2vec-vision', 'detr', + 'dinov2', 'flaubert', 'groupvit', + 'hiera', 'ibert', 'layoutlm', 'layoutlmv3', @@ -216,6 +218,8 @@ class NormalizedConfigManager: 'owlvit', 'perceiver', 'roformer', + 'segformer', + 'siglip', 'squeezebert', 'table-transformer', """ diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index 680ae7e91b8..6fdffd132fa 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -69,6 +69,7 @@ "deberta-v2": "hf-internal-testing/tiny-random-DebertaV2Model", "decision-transformer": "edbeeching/decision-transformer-gym-hopper-medium", "deit": "hf-internal-testing/tiny-random-DeiTModel", + "dinov2": "hf-internal-testing/tiny-random-Dinov2Model", "donut": "fxmarty/tiny-doc-qa-vision-encoder-decoder", "donut-swin": "hf-internal-testing/tiny-random-DonutSwinModel", "detr": "hf-internal-testing/tiny-random-DetrModel", # hf-internal-testing/tiny-random-detr is larger @@ -103,6 +104,7 @@ "gptj": "hf-internal-testing/tiny-random-GPTJModel", "granite": "hf-internal-testing/tiny-random-GraniteForCausalLM", "groupvit": "hf-internal-testing/tiny-random-groupvit", + "hiera": "hf-internal-testing/tiny-random-HieraForImageClassification", "ibert": "hf-internal-testing/tiny-random-IBertModel", "imagegpt": "hf-internal-testing/tiny-random-ImageGPTModel", "levit": "hf-internal-testing/tiny-random-LevitModel", @@ -115,7 +117,9 @@ "m2m-100": "hf-internal-testing/tiny-random-m2m_100", "marian": "sshleifer/tiny-marian-en-de", # hf-internal-testing ones are broken "markuplm": "hf-internal-testing/tiny-random-MarkupLMModel", + "maskformer": "hf-internal-testing/tiny-random-MaskFormerForInstanceSegmentation", "mbart": "hf-internal-testing/tiny-random-mbart", + "mgp-str": "hf-internal-testing/tiny-random-MgpstrForSceneTextRecognition", "mistral": "echarlaix/tiny-random-mistral", "mobilebert": "hf-internal-testing/tiny-random-MobileBertModel", "mobilenet-v2": "hf-internal-testing/tiny-random-MobileNetV2Model", @@ -126,6 +130,8 @@ "mt5": "lewtun/tiny-random-mt5", "musicgen": "hf-internal-testing/tiny-random-MusicgenForConditionalGeneration", "nystromformer": "hf-internal-testing/tiny-random-NystromformerModel", + "olmo": "hf-internal-testing/tiny-random-OlmoForCausalLM", + "olmo2": "hf-internal-testing/tiny-random-Olmo2ForCausalLM", "opt": "hf-internal-testing/tiny-random-OPTModel", "owlv2": "hf-internal-testing/tiny-random-Owlv2Model", "owlvit": "hf-tiny-model-private/tiny-random-OwlViTModel", @@ -140,6 +146,7 @@ # "rembert": "google/rembert", "rembert": "hf-internal-testing/tiny-random-RemBertModel", "poolformer": "hf-internal-testing/tiny-random-PoolFormerModel", + "pvt": "hf-internal-testing/tiny-random-PvtForImageClassification", "qwen2": 
"fxmarty/tiny-dummy-qwen2", "regnet": "hf-internal-testing/tiny-random-RegNetModel", "resnet": "hf-internal-testing/tiny-random-resnet", @@ -148,13 +155,18 @@ "rt-detr": "PekingU/rtdetr_r18vd", "sam": "fxmarty/sam-vit-tiny-random", "segformer": "hf-internal-testing/tiny-random-SegformerModel", + "siglip": "hf-internal-testing/tiny-random-SiglipModel", + "siglip-vision-model": "hf-internal-testing/tiny-random-SiglipVisionModel", "splinter": "hf-internal-testing/tiny-random-SplinterModel", "squeezebert": "hf-internal-testing/tiny-random-SqueezeBertModel", "swin": "hf-internal-testing/tiny-random-SwinModel", + "swinv2": "hf-internal-testing/tiny-random-Swinv2Model", "swin2sr": "hf-internal-testing/tiny-random-Swin2SRModel", "t5": "hf-internal-testing/tiny-random-t5", "table-transformer": "hf-internal-testing/tiny-random-TableTransformerModel", "vit": "hf-internal-testing/tiny-random-vit", + "vit-mae": "hf-internal-testing/tiny-random-ViTMAEModel", + "vit-msn": "hf-internal-testing/tiny-random-ViTMSNForImageClassification", "vits": "echarlaix/tiny-random-vits", "yolos": "hf-internal-testing/tiny-random-YolosModel", "whisper": "openai/whisper-tiny.en", # hf-internal-testing ones are broken @@ -235,6 +247,7 @@ "gpt-neox": "EleutherAI/gpt-neox-20b", "gptj": "anton-l/gpt-j-tiny-random", # TODO "groupvit": "nvidia/groupvit-gcc-yfcc", + "hiera": "facebook/hiera-tiny-224-in1k-hf", "ibert": "kssteven/ibert-roberta-base", "imagegpt": "openai/imagegpt-small", "levit": "facebook/levit-128S", @@ -247,7 +260,9 @@ "m2m-100": "hf-internal-testing/tiny-random-m2m_100", # Not using facebook/m2m100_418M because it takes too much time for testing. "marian": "Helsinki-NLP/opus-mt-en-de", "markuplm": "hf-internal-testing/tiny-random-MarkupLMModel", + "maskformer": "facebook/maskformer-swin-tiny-coco", "mbart": "sshleifer/tiny-mbart", + "mgp-str": "alibaba-damo/mgp-str-base", "mobilebert": "google/mobilebert-uncased", # "mobilenet_v1": "google/mobilenet_v1_0.75_192", # "mobilenet_v2": "google/mobilenet_v2_0.35_96", @@ -261,6 +276,7 @@ "perceiver": "hf-internal-testing/tiny-random-PerceiverModel", # Not using deepmind/language-perceiver because it takes too much time for testing. 
"rembert": "google/rembert", "poolformer": "hf-internal-testing/tiny-random-PoolFormerModel", + "pvt": "hf-internal-testing/tiny-random-PvtForImageClassification", "regnet": "facebook/regnet-y-040", "resnet": "microsoft/resnet-50", "roberta": "roberta-base", @@ -268,12 +284,16 @@ "rt-detr": "PekingU/rtdetr_r101vd", "sam": "facebook/sam-vit-base", "segformer": "nvidia/segformer-b0-finetuned-ade-512-512", + "siglip": "google/siglip-base-patch16-224", "splinter": "hf-internal-testing/tiny-random-SplinterModel", "squeezebert": "squeezebert/squeezebert-uncased", "swin": "microsoft/swin-tiny-patch4-window7-224", + "swinv2": "microsoft/swinv2-tiny-patch4-window16-256", "t5": "t5-small", "table-transformer": "microsoft/table-transformer-detection", "vit": "google/vit-base-patch16-224", + "vit-mae": "facebook/vit-mae-base", + "vit-msn": "facebook/vit-msn-small", "yolos": "hustvl/yolos-tiny", "whisper": "openai/whisper-tiny.en", "hubert": "facebook/hubert-base-ls960", diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index 8f52ef45180..255c0d9d0e7 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -2827,6 +2827,7 @@ class ORTModelForImageClassificationIntegrationTest(ORTModelTestMixin): "convnextv2", "data2vec_vision", "deit", + "dinov2", "levit", "mobilenet_v1", "mobilenet_v2", diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index cccecd53817..02ced3be3aa 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -87,8 +87,9 @@ "deit": "hf-internal-testing/tiny-random-DeiTModel", "donut": "fxmarty/tiny-doc-qa-vision-encoder-decoder", "detr": "hf-internal-testing/tiny-random-detr", - "dpt": "hf-internal-testing/tiny-random-DPTModel", + "dinov2": "hf-internal-testing/tiny-random-Dinov2Model", "distilbert": "hf-internal-testing/tiny-random-DistilBertModel", + "dpt": "hf-internal-testing/tiny-random-DPTModel", "electra": "hf-internal-testing/tiny-random-ElectraModel", "encoder-decoder": { "hf-internal-testing/tiny-random-EncoderDecoderModel-bert-bert": [ @@ -107,6 +108,7 @@ "gptj": "hf-internal-testing/tiny-random-GPTJForCausalLM", "granite": "hf-internal-testing/tiny-random-GraniteForCausalLM", "groupvit": "hf-internal-testing/tiny-random-groupvit", + "hiera": "hf-internal-testing/tiny-random-HieraForImageClassification", "hubert": "hf-internal-testing/tiny-random-HubertModel", "ibert": "hf-internal-testing/tiny-random-IBertModel", "latent-consistency": "echarlaix/tiny-random-latent-consistency", @@ -118,6 +120,7 @@ "m2m_100": "hf-internal-testing/tiny-random-m2m_100", "marian": "echarlaix/tiny-random-marian", "mbart": "hf-internal-testing/tiny-random-mbart", + "mgp-str": "hf-internal-testing/tiny-random-MgpstrForSceneTextRecognition", "mistral": "echarlaix/tiny-random-mistral", "mobilebert": "hf-internal-testing/tiny-random-MobileBertModel", "mobilenet_v1": "google/mobilenet_v1_0.75_192", @@ -134,6 +137,7 @@ "phi3": "Xenova/tiny-random-Phi3ForCausalLM", "pix2struct": "fxmarty/pix2struct-tiny-random", "poolformer": "hf-internal-testing/tiny-random-PoolFormerModel", + "pvt": "hf-internal-testing/tiny-random-PvtForImageClassification", "qwen2": "fxmarty/tiny-dummy-qwen2", "rembert": "hf-internal-testing/tiny-random-RemBertModel", "resnet": "hf-internal-testing/tiny-random-resnet", @@ -142,12 +146,14 @@ "segformer": "hf-internal-testing/tiny-random-SegformerModel", "sew": "hf-internal-testing/tiny-random-SEWModel", 
"sew_d": "asapp/sew-d-tiny-100k-ft-ls100h", + "siglip": "hf-internal-testing/tiny-random-SiglipModel", "squeezebert": "hf-internal-testing/tiny-random-SqueezeBertModel", "speech_to_text": "hf-internal-testing/tiny-random-Speech2TextModel", "stable-diffusion": "hf-internal-testing/tiny-stable-diffusion-torch", "stable-diffusion-3": "optimum-internal-testing/tiny-random-stable-diffusion-3", "stable-diffusion-xl": "echarlaix/tiny-random-stable-diffusion-xl", "swin": "hf-internal-testing/tiny-random-SwinModel", + "swinv2": "hf-internal-testing/tiny-random-Swinv2Model", "swin-window": "yujiepan/tiny-random-swin-patch4-window7-224", "swin2sr": "hf-internal-testing/tiny-random-Swin2SRForImageSuperResolution", "t5": "hf-internal-testing/tiny-random-t5",