Support transformers 4.43 #1971

Merged 32 commits into main from support-transformers-4.43 on Aug 5, 2024.
The diff below shows the changes from 8 of the 32 commits.

Commits
9333b58 fix bt bark test (IlyasMoutawwakil, Jul 25, 2024)
4dda6df setup (IlyasMoutawwakil, Jul 25, 2024)
5926bc5 patch clip models for sd (IlyasMoutawwakil, Jul 25, 2024)
c2a5c03 infer ort model dtype property from inputs dtypes (IlyasMoutawwakil, Jul 25, 2024)
b610212 patch all clip variants (IlyasMoutawwakil, Jul 25, 2024)
9923084 device setter (IlyasMoutawwakil, Jul 25, 2024)
0cb6be7 bigger model for now (IlyasMoutawwakil, Jul 25, 2024)
88831a5 fix device attribution (IlyasMoutawwakil, Jul 25, 2024)
a1f838c onnx opset for owlvit and owlv2 (IlyasMoutawwakil, Jul 25, 2024)
b8f5f32 model dtype (IlyasMoutawwakil, Jul 25, 2024)
81d0227 revert (IlyasMoutawwakil, Jul 25, 2024)
82a2879 use model part dtype instead (IlyasMoutawwakil, Jul 25, 2024)
d2a15b5 no need for dtype with diffusion pipelines (IlyasMoutawwakil, Jul 25, 2024)
c761026 revert (IlyasMoutawwakil, Jul 25, 2024)
0eb5dce fix clip text model with projection not outputting hidden states (IlyasMoutawwakil, Jul 25, 2024)
f568bf6 whisper generation (IlyasMoutawwakil, Jul 26, 2024)
92ea60b fix whisper, support cache_position, and using transformers whisper g… (IlyasMoutawwakil, Jul 29, 2024)
170eaba style (IlyasMoutawwakil, Jul 29, 2024)
991b66b create cache position for merged decoder and fix test for non whisper… (IlyasMoutawwakil, Jul 29, 2024)
8f8e6ca typo (IlyasMoutawwakil, Jul 29, 2024)
e5934b3 Merge branch 'main' into support-transformers-4.43 (echarlaix, Jul 30, 2024)
96bdde1 conditioned cache position argument (IlyasMoutawwakil, Jul 30, 2024)
9d09389 update whisper min transformers version (IlyasMoutawwakil, Jul 30, 2024)
056e450 compare whisper ort generation with transformers (IlyasMoutawwakil, Jul 30, 2024)
b3d9181 Merge branch 'support-transformers-4.43' of https://github.com/huggin… (IlyasMoutawwakil, Jul 30, 2024)
825cc6d fix generation length for speech to text model type (IlyasMoutawwakil, Jul 30, 2024)
3fe0cac cache position in whisper only with dynamic axis decoder_sequence_length (IlyasMoutawwakil, Jul 30, 2024)
b3948b9 use minimal prepare_inputs_for_generation in ORTModelForSpeechSeq2Seq (IlyasMoutawwakil, Aug 2, 2024)
2f69a8a remove version restrictions on whisper (IlyasMoutawwakil, Aug 2, 2024)
4cc1065 comment (IlyasMoutawwakil, Aug 2, 2024)
8077ded fix (IlyasMoutawwakil, Aug 2, 2024)
aa9b9d6 simpler (IlyasMoutawwakil, Aug 5, 2024)
37 changes: 28 additions & 9 deletions optimum/exporters/onnx/model_configs.py
@@ -71,6 +71,7 @@
)
from .constants import ONNX_DECODER_MERGED_NAME, ONNX_DECODER_NAME, ONNX_DECODER_WITH_PAST_NAME
from .model_patcher import (
CLIPModelPatcher,
FalconModelPatcher,
MistralModelPatcher,
MusicgenModelPatcher,
@@ -911,10 +912,16 @@ def outputs(self) -> Dict[str, Dict[int, str]]:

return common_outputs

def patch_model_for_export(
self,
model: Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"],
model_kwargs: Optional[Dict[str, Any]] = None,
) -> "ModelPatcher":
return CLIPModelPatcher(self, model, model_kwargs=model_kwargs)


class CLIPOnnxConfig(TextAndVisionOnnxConfig):
NORMALIZED_CONFIG_CLASS = CLIPNormalizedConfig
DEFAULT_ONNX_OPSET = 14

@property
def inputs(self) -> Dict[str, Dict[int, str]]:
@@ -933,6 +940,13 @@ def outputs(self) -> Dict[str, Dict[int, str]]:
"image_embeds": {0: "image_batch_size"},
}

def patch_model_for_export(
self,
model: Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"],
model_kwargs: Optional[Dict[str, Any]] = None,
) -> "ModelPatcher":
return CLIPModelPatcher(self, model, model_kwargs=model_kwargs)


class SentenceTransformersCLIPOnnxConfig(CLIPOnnxConfig):
@property
@@ -978,6 +992,13 @@ def outputs(self) -> Dict[str, Dict[int, str]]:

return common_outputs

def patch_model_for_export(
self,
model: Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"],
model_kwargs: Optional[Dict[str, Any]] = None,
) -> "ModelPatcher":
return CLIPModelPatcher(self, model, model_kwargs=model_kwargs)


class CLIPTextOnnxConfig(CLIPTextWithProjectionOnnxConfig):
@property
@@ -992,14 +1013,12 @@ def outputs(self) -> Dict[str, Dict[int, str]]:

return common_outputs

-    def generate_dummy_inputs(self, framework: str = "pt", **kwargs):
-        dummy_inputs = super().generate_dummy_inputs(framework=framework, **kwargs)
-
-        if framework == "pt":
-            import torch
-
-            dummy_inputs["input_ids"] = dummy_inputs["input_ids"].to(dtype=torch.int32)
-        return dummy_inputs
+    def patch_model_for_export(
+        self,
+        model: Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"],
+        model_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> "ModelPatcher":
+        return CLIPModelPatcher(self, model, model_kwargs=model_kwargs)


class UNetOnnxConfig(VisionOnnxConfig):
17 changes: 17 additions & 0 deletions optimum/exporters/onnx/model_patcher.py
@@ -1131,3 +1131,20 @@ def __init__(
self._update_causal_mask_original = self._model.model._update_causal_mask
else:
self._update_causal_mask_original = self._model._update_causal_mask


class CLIPModelPatcher(ModelPatcher):
def __enter__(self):
super().__enter__()

if _transformers_version >= version.parse("4.43"):
from transformers.models.clip.modeling_clip import CLIPAttention, CLIPSdpaAttention

self.original_sdpa_forward, CLIPSdpaAttention.forward = CLIPSdpaAttention.forward, CLIPAttention.forward

def __exit__(self, exc_type, exc_value, traceback):
super().__exit__(exc_type, exc_value, traceback)
if _transformers_version >= version.parse("4.43"):
from transformers.models.clip.modeling_clip import CLIPSdpaAttention

CLIPSdpaAttention.forward = self.original_sdpa_forward
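
Note: a minimal sketch of how this patcher is exercised, assuming transformers >= 4.43 (where CLIPSdpaAttention still exists) and the patch_model_for_export overrides added above in model_configs.py. The checkpoint name and the standalone usage are illustrative; the ONNX exporter normally drives this internally.

from transformers import CLIPTextModel
from optimum.exporters.onnx.model_configs import CLIPTextOnnxConfig

# Illustrative checkpoint; any CLIP text model should behave the same way.
model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
onnx_config = CLIPTextOnnxConfig(model.config)

# patch_model_for_export returns the CLIPModelPatcher defined above.
patcher = onnx_config.patch_model_for_export(model)

with patcher:
    # Inside this block, CLIPSdpaAttention.forward is swapped for the eager
    # CLIPAttention.forward, which traces cleanly to ONNX; the swap is
    # undone on exit.
    pass  # e.g. torch.onnx.export(model, ...) would run here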
3 changes: 1 addition & 2 deletions optimum/exporters/utils.py
@@ -96,7 +96,7 @@ def _get_submodels_for_export_diffusion(
pipeline, (StableDiffusionPipeline, StableDiffusionImg2ImgPipeline, StableDiffusionInpaintPipeline)
)
is_stable_diffusion_xl = isinstance(
-        pipeline, (StableDiffusionXLImg2ImgPipeline, StableDiffusionXLInpaintPipeline, StableDiffusionXLPipeline)
+        pipeline, (StableDiffusionXLPipeline, StableDiffusionXLImg2ImgPipeline, StableDiffusionXLInpaintPipeline)
)
is_latent_consistency_model = isinstance(
pipeline, (LatentConsistencyModelPipeline, LatentConsistencyModelImg2ImgPipeline)
@@ -150,7 +150,6 @@ def _get_submodels_for_export_diffusion(

text_encoder_2 = getattr(pipeline, "text_encoder_2", None)
if text_encoder_2 is not None:
-        text_encoder_2.config.output_hidden_states = True
models_for_export["text_encoder_2"] = text_encoder_2

return models_for_export
8 changes: 7 additions & 1 deletion optimum/onnxruntime/modeling_diffusion.py
@@ -452,10 +452,14 @@ def to(self, device: Union[torch.device, str, int]):
Returns:
`ORTModel`: the model placed on the requested device.
"""

device, provider_options = parse_device(device)
provider = get_provider_for_device(device)
validate_provider_availability(provider) # raise error if the provider is not available
-        self.device = device

+        if device.type == "cuda" and self.providers[0] == "TensorrtExecutionProvider":
+            return self

self.vae_decoder.session.set_providers([provider], provider_options=[provider_options])
self.text_encoder.session.set_providers([provider], provider_options=[provider_options])
self.unet.session.set_providers([provider], provider_options=[provider_options])
@@ -464,6 +468,8 @@ def to(self, device: Union[torch.device, str, int]):
self.vae_encoder.session.set_providers([provider], provider_options=[provider_options])

self.providers = self.vae_decoder.session.get_providers()
+        self._device = device

return self

@classmethod
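
As a usage sketch (checkpoint name illustrative, and moving to "cuda" assumes a GPU build of onnxruntime): moving a diffusion pipeline now routes every sub-model session to the matching execution provider and, as in ORTModel.to, short-circuits when TensorRT is already the active provider.

from optimum.onnxruntime import ORTStableDiffusionPipeline

# Illustrative tiny test checkpoint; export=True converts the weights to ONNX.
pipe = ORTStableDiffusionPipeline.from_pretrained(
    "hf-internal-testing/tiny-stable-diffusion-torch", export=True
)
pipe = pipe.to("cuda")  # unet, vae and text encoder sessions all switch provider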
23 changes: 18 additions & 5 deletions optimum/onnxruntime/modeling_ort.py
@@ -276,7 +276,19 @@ def __init__(

self._ordered_input_names = get_ordered_input_names(self.input_names.keys(), func=self.forward)

-    # TODO: why do we make device a property since we are only access the value, and do not do any check when setting the value?
+    @property
+    def dtype(self) -> torch.dtype:
+        """
+        `torch.dtype`: The dtype of the model.
+        """
+
+        for dtype in self.input_dtypes.values():
+            torch_dtype = TypeHelper.ort_type_to_torch_type(dtype)
+            if torch_dtype.is_floating_point:
+                return torch_dtype
+
+        return None

@property
def device(self) -> torch.device:
"""
@@ -286,8 +298,8 @@ def device(self) -> torch.device:
return self._device

@device.setter
-    def device(self, value: torch.device):
-        self._device = value
+    def device(self, **kwargs):
+        raise AttributeError("The device attribute is read-only, please use the `to` method to change the device.")

@property
def use_io_binding(self):
@@ -309,13 +321,13 @@ def to(self, device: Union[torch.device, str, int]):
Returns:
`ORTModel`: the model placed on the requested device.
"""

device, provider_options = parse_device(device)

if device.type == "cuda" and self.providers[0] == "TensorrtExecutionProvider":
return self

-        self.device = device
-        provider = get_provider_for_device(self.device)
+        provider = get_provider_for_device(device)
validate_provider_availability(provider) # raise error if the provider is not available

# IOBinding is only supported for CPU and CUDA Execution Providers.
@@ -331,6 +343,7 @@

self.model.set_providers([provider], provider_options=[provider_options])
self.providers = self.model.get_providers()
+        self._device = device

return self

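
A small sketch of the resulting behavior (the checkpoint name is illustrative): dtype is now inferred from the ONNX session's input dtypes, and device can no longer be assigned directly.

import torch
from optimum.onnxruntime import ORTModelForFeatureExtraction

# Illustrative ONNX checkpoint from the Hub.
model = ORTModelForFeatureExtraction.from_pretrained("optimum/all-MiniLM-L6-v2")

print(model.dtype)   # first floating-point input dtype, or None if every input is integer
print(model.device)  # torch.device("cpu") until moved

model = model.to("cpu")  # the supported way to (re)place the model
try:
    model.device = torch.device("cuda")
except AttributeError as err:
    print(err)  # the setter now raises: use .to() instead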
3 changes: 2 additions & 1 deletion optimum/onnxruntime/modeling_seq2seq.py
@@ -1124,12 +1124,13 @@ def to(self, device: Union[torch.device, str, int]):
provider = get_provider_for_device(device)
validate_provider_availability(provider) # raise error if the provider is not available

-        self.device = device
self.encoder.session.set_providers([provider], provider_options=[provider_options])
self.decoder.session.set_providers([provider], provider_options=[provider_options])
if self.decoder_with_past is not None:
self.decoder_with_past.session.set_providers([provider], provider_options=[provider_options])

self.providers = self.encoder.session.get_providers()
+        self._device = device

return self

2 changes: 1 addition & 1 deletion setup.py
@@ -15,7 +15,7 @@
REQUIRED_PKGS = [
"coloredlogs",
"sympy",
"transformers[sentencepiece]>=4.26.0,<4.43.0",
"transformers[sentencepiece]>=4.26.0,<4.44.0",
"torch>=1.11",
"packaging",
"numpy<2.0", # transformers requires numpy<2.0 https://github.com/huggingface/transformers/pull/31569
5 changes: 3 additions & 2 deletions tests/bettertransformer/testing_utils.py
@@ -27,7 +27,7 @@

MODELS_DICT = {
"albert": "hf-internal-testing/tiny-random-AlbertModel",
"bark": "ylacombe/bark-small", # TODO: put a smaller model, this one is 1.7GB...
"bark": "ylacombe/bark-small",
"bart": "hf-internal-testing/tiny-random-bart",
"bert": "hf-internal-testing/tiny-random-BertModel",
"bert-generation": "ybelkada/random-tiny-BertGenerationModel",
@@ -359,7 +359,8 @@ def _test_save_load_invertible(self, model_id, keep_original_model=True):
for name, param in bt_model.named_parameters():
self.assertFalse(param.device.type == "meta", f"Parameter {name} is on the meta device.")

-            bt_model.save_pretrained(tmpdirname)
+            # saving a normal transformers bark model fails because of shared tensors
+            bt_model.save_pretrained(tmpdirname, safe_serialization=hf_model.config.model_type != "bark")

bt_model_from_load = AutoModel.from_pretrained(tmpdirname)
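
For reference, a standalone sketch of the failure mode this works around (the save path is illustrative): bark checkpoints contain tied/shared tensors, which safetensors refuses to serialize, so saving has to opt out of safe serialization for that model type.

from transformers import AutoModel

model = AutoModel.from_pretrained("ylacombe/bark-small")

# safetensors (the default format in recent transformers) rejects shared
# tensors, so bark is saved with the legacy pickle-based format instead.
model.save_pretrained("/tmp/bark-small", safe_serialization=False)
reloaded = AutoModel.from_pretrained("/tmp/bark-small")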
